DataCamp offer interactive courses related to Python Programming. Since R Markdown documents can run simple Python code chunks (though the data is not accessible to future chunks, a large difference from R Markdown for R), this document attempts to summarize notes from the modules when possible.
Topic areas summarized include:
The complete version as of July 31, 2017 has been archived as DataCamp_PythonNotes_v001. Archive files for DataCamp_Python_ImportClean_v002 and DataCamp_Python_Programming_v002 have also been created to contain summaries of those areas.
This document will continue to include:
Chapter 1 - Data Ingestion and Inspection
Review of pandas data frames - tabular data structure with labelled rows and columns:
Building DataFrames from scratch:
Importing and exporting data - example using ISSN_D_tot.csv, sunspot data:
Plotting with pandas - can plot either the panda Series or the underlying numpy array - plt.plot() followed by plt.show() works on either/both:
Example code includes:
myPath = "./PythonInputFiles/"
# NEED TO CREATE FRAME df - "Total Population" - [3034970564.0, 3684822701.0, 4436590356.0, 5282715991.0, 6115974486.0, 6924282937.0] indexed by "Year" [1960, 1970, 1980, 1990, 2000, 2010]
# Import numpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame( {"Total Population":[3034970564.0, 3684822701.0, 4436590356.0, 5282715991.0, 6115974486.0, 6924282937.0], "Year":[1960, 1970, 1980, 1990, 2000, 2010]} )
df.index = df["Year"]
del df["Year"]
world_population = df.copy()
# Create array of DataFrame values: np_vals
np_vals = df.values
# Create new array of base 10 logarithm values: np_vals_log10
np_vals_log10 = np.log10(np_vals)
# Create array of new DataFrame by passing df to np.log10(): df_log10
df_log10 = np.log10(df)
# Print original and new data containers
print(type(np_vals), type(np_vals_log10))
print(type(df), type(df_log10))
list_keys = ['Country', 'Total']
list_values = [['United States', 'Soviet Union', 'United Kingdom'], [1118, 473, 273]]
# Zip the 2 lists together into one list of (key,value) tuples: zipped
zipped = list(zip(list_keys, list_values))
# Inspect the list using print()
print(zipped)
# Build a dictionary with the zipped list: data
data = dict(zipped)
# Build and inspect a DataFrame from the dictionary: df
df = pd.DataFrame(data)
print(df)
tempDict = {"a":[1980, 1981, 1982] , "b":["Blondie", "Chris Cross", "Joan Jett"] , "c":["Call Me", "Arthurs Theme", "I Love Rock and Roll"], "d":[6, 3, 7]}
df = pd.DataFrame(tempDict)
# Build a list of labels: list_labels
list_labels = ['year', 'artist', 'song', 'chart weeks']
# Assign the list of labels to the columns attribute: df.columns
df.columns = list_labels
print(df)
cities = ['Manheim', 'Preston park', 'Biglerville', 'Indiana', 'Curwensville', 'Crown', 'Harveys lake', 'Mineral springs', 'Cassville', 'Hannastown', 'Saltsburg', 'Tunkhannock', 'Pittsburgh', 'Lemasters', 'Great bend']
# Make a string with the value 'PA': state
state = "PA"
# Construct a dictionary: data
data = {'state':state, 'city':cities}
# Construct a DataFrame from dictionary data: df
df = pd.DataFrame(data)
# Print the DataFrame
print(df)
# "world_population.csv is the same 6x2 population data as per the above
# Read in the file: df1
# df1 = pd.read_csv("world_population.csv")
# Skipped this part
# Create a list of the new column labels: new_labels
# new_labels = ["year", "population"]
# Read in the file, specifying the header and names parameters: df2
# df2 = pd.read_csv('world_population.csv', header=0, names=new_labels)
# Skipped this step
# Print both the DataFrames
# print(df1)
# print(df2)
# DO NOT HAVE the messy data - file_messy is "messy_stock_data.tsv"
# Read the raw file as-is: df1
# df1 = pd.read_csv(file_messy)
# Print the output of df1.head()
# print(df1.head())
# Read in the file with the correct parameters: df2
# df2 = pd.read_csv(file_messy, delimiter="\t", header=3, comment="#")
# Print the output of df2.head()
# print(df2.head())
# Save the cleaned up DataFrame to a CSV file without the index
# df2.to_csv(file_clean, index=False)
# Save the cleaned up DataFrame to an excel file without the index
# df2.to_excel('file_clean.xlsx', index=False)
# DO NOT HAVE DataFrame df, which is a 744x1 of "Temperature (deg F)" indexed automatically as 0-743
# Downloaded raw METAR data for KAUS using 0801100000 UTC - 0831102359 UTC
# Coded to a cleaned CSV as per below
#
#
# metarList = []
# for line in open(myPath + "KAUS_Metar_Aug2010.txt", "r"): metarList.append(line.rstrip())
# cleanMetar = []
# cleanLine = ""
# for recs in metarList:
# if recs.startswith("#") or recs == "" : continue
# if recs.startswith("2") :
# if cleanLine != "" :
# cleanMetar.append(cleanLine)
# cleanLine = recs
# else:
# cleanLine = cleanLine + " " + recs.strip()
#
# cleanMetar.append(cleanLine)
#
# useMetar = [textBlock for textBlock in cleanMetar if "METAR" in textBlock]
# useSpeci = [textBlock for textBlock in cleanMetar if "SPECI" in textBlock]
# assert len(cleanMetar) == len(useMetar) + len(useSpeci)
#
# import re
#
# metTime = []
# tempF = []
# dewF = []
# altMG = []
#
# for textBlock in useMetar:
# if textBlock.endswith("NIL="):
# print("Not using line", textBlock)
# continue
#
# # print(textBlock)
# dateUTC = textBlock.split()[0]
#
# tempData = re.findall("T([0-9][0-9][0-9][0-9])([0-9][0-9][0-9][0-9])", textBlock)
# assert len(tempData) == 1
# a, b = tempData[0]
# tempC = float(a[1:])/10
# dewC = float(b[1:])/10
# if a[0] == "1" : tempC = -tempC
# if b[0] == "1" : dewC = -dewC
#
# tF = round((9/5) * tempC + 32, 0)
# dF = round((9/5) * dewC + 32, 0)
#
# altData = re.findall("A([0-9][0-9][0-9][0-9])", textBlock)
# assert len(altData) == 1
#
# aMG = float(altData[0]) / 100
# # print(dateUTC, tempC, dewC, altMG, tempF, dewF)
#
# metTime.append(dateUTC)
# tempF.append(tF)
# dewF.append(dF)
# altMG.append(aMG)
#
# metarKAUS = pd.DataFrame( {"DateTime (UTC)":metTime, "Temperature (deg F)":tempF , "Dew Point (deg F)":dewF, "Pressure (atm)":altMG} )
# metarKAUS.index = metarKAUS["DateTime (UTC)"]
# del metarKAUS["DateTime (UTC)"]
#
# metarKAUS.to_csv(myPath + "KAUS_Metar_Aug2010_Clean.csv")
# Create or import the data
# import random
# df = pd.DataFrame( {"Temperature (deg F)":np.random.randint(low=60, high=100, size=744)} )
dfFull = pd.read_csv(myPath + "KAUS_Metar_Aug2010_Clean.csv")
df = dfFull.loc[:, "Temperature (deg F)"]
# Create a plot with color='red'
df.plot(color="red")
# Add a title
plt.title('Temperature in Austin')
# Specify the x-axis label
plt.xlabel('Hours since midnight August 1, 2010')
# Specify the y-axis label
plt.ylabel('Temperature (degrees F)')
# Display the plot
# plt.show()
plt.savefig("_dummyPy050.png", bbox_inches="tight")
plt.clf()
# DO NOT HAVE DataFrame df, which is a 744x3 of "Temperature (deg F)", "Dew Point (deg F)", "Pressure (atm)" indexed automatically as 0-743
# df["Dew Point (deg F)"] = df.iloc[:, 0] + np.random.randint(low=-30, high=0, size=744)
# df["Pressure (atm)"] = np.random.randint(low=980, high=1020, size=744)
# Use dfFull rather than manufacturing data
df = dfFull.copy()
df.index = [x[6:8] + "-" + "{0:0>2}".format(str(int(x[9:10]) + 1)) + "Z" for x in df["DateTime (UTC)"].astype(str)]
del df["DateTime (UTC)"]
# Plot all columns (default)
df.plot()
# plt.show()
plt.savefig("_dummyPy051.png", bbox_inches="tight")
plt.clf()
# Plot all columns as subplots
df.plot(subplots=True)
# plt.show()
plt.savefig("_dummyPy052.png", bbox_inches="tight")
plt.clf()
# Plot just the Dew Point data
column_list1 = ['Dew Point (deg F)']
df[column_list1].plot()
# plt.show()
plt.savefig("_dummyPy053.png", bbox_inches="tight")
plt.clf()
# Plot the Dew Point and Temperature data, but not the Pressure data
column_list2 = ['Temperature (deg F)','Dew Point (deg F)']
df[column_list2].plot()
# plt.show()
plt.savefig("_dummyPy054.png", bbox_inches="tight")
plt.clf()
## <class 'numpy.ndarray'> <class 'numpy.ndarray'>
## <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
## [('Country', ['United States', 'Soviet Union', 'United Kingdom']), ('Total', [1118, 473, 273])]
## Country Total
## 0 United States 1118
## 1 Soviet Union 473
## 2 United Kingdom 273
## year artist song chart weeks
## 0 1980 Blondie Call Me 6
## 1 1981 Chris Cross Arthurs Theme 3
## 2 1982 Joan Jett I Love Rock and Roll 7
## city state
## 0 Manheim PA
## 1 Preston park PA
## 2 Biglerville PA
## 3 Indiana PA
## 4 Curwensville PA
## 5 Crown PA
## 6 Harveys lake PA
## 7 Mineral springs PA
## 8 Cassville PA
## 9 Hannastown PA
## 10 Saltsburg PA
## 11 Tunkhannock PA
## 12 Pittsburgh PA
## 13 Lemasters PA
## 14 Great bend PA
Temperature - Austin, TX (Aug 2010):
METAR plots - Austin, TX (Aug 2010):
METAR Sub-plots - Austin, TX (Aug 2010):
Dew Point - Austin, TX (Aug 2010):
Temperature and Dew Point - Austin, TX (Aug 2010):
Chapter 2 - Exploratory Data Analysis
Visual exploratory data analysis - using Fisher’s iris flower data (similar to the R dataset):
Statistical exploratory data analysis - starting with the .describe() method which is very similar to summary() in R - counts, means, quartiles, and the like:
Separating populations with boolean indexing - subsets of columns and/or rows for plotting, summarizing, and the like:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
dummyStock = pd.read_csv(myPath + "StockChart_20170615.csv", header=None)
dummyStock.columns = ["Symbol", "Data"]
# Data is a single space-delimited string of Date - Open - High - Low - Close - Volume
dummyStockSplit = dummyStock["Data"].str.split()
dummyDates = [datetime.strptime(x[0], "%m/%d/%Y") for x in dummyStockSplit]
dummyClose = [float(x[4]) for x in dummyStockSplit]
dfStock = pd.DataFrame( {"date":dummyDates, "symbol":dummyStock["Symbol"] , "close":dummyClose} )
df = dfStock.pivot(index="date", columns="symbol", values="close").resample("M").max()
# df is 12 x 4 with columns Month-AAPL-GOOG-IBM
# Create a list of y-axis column names: y_columns
y_columns = ["AAPL", "IBM"]
# Generate a line plot
df.plot(y=y_columns)
# Add the title
plt.title('Monthly stock prices')
# Add the y-axis label
plt.ylabel('Price ($US)')
# Display the plot
# plt.show()
plt.savefig("_dummyPy055.png", bbox_inches="tight")
plt.clf()
# Here, df appears to be the mtcars data
# Saved file from R
df = pd.read_csv(myPath + "mtcars.csv", index_col=0)
# sizes is a pre-defined np.array(), not sure of what
sizes = df["cyl"]
# Generate a scatter plot
df.plot(kind="scatter", x='hp', y='mpg', s=5*(sizes-3))
# Add the title
plt.title('Fuel efficiency vs Horse-power')
# Add the x-axis label
plt.xlabel('Horse-power')
# Add the y-axis label
plt.ylabel('Fuel efficiency (mpg)')
# Display the plot
# plt.show()
plt.savefig("_dummyPy056.png", bbox_inches="tight")
plt.clf()
# Make a list of the column names to be plotted: cols
cols = ["wt", "mpg"]
# Generate the box plots
df[cols].plot(kind="box", subplots=True)
# Display the plot
# plt.show()
plt.savefig("_dummyPy057.png", bbox_inches="tight")
plt.clf()
# Here, df is the tipping data from the Seaborn package, with emphasis on the column "fraction"
# Create a reasonable analog based on the pre-made CSV
tips = pd.read_csv(myPath + "tips.csv")
tips.sex = tips["sex"].astype("category")
tips.smoker = tips["smoker"].astype("category")
tips['total_bill'] = pd.to_numeric(tips["total_bill"], errors="coerce")
tips['tip'] = pd.to_numeric(tips["tip"], errors="coerce")
tips["fraction"] = tips["tip"] / tips["total_bill"]
df = tips.copy()
# This formats the plots such that they appear on separate rows
fig, axes = plt.subplots(nrows=2, ncols=1)
# Plot the PDF and CDF on the two axes
df.fraction.plot(ax=axes[0], kind='hist', bins=30, normed=True, range=(0,.3))
df.fraction.plot(ax=axes[1], kind="hist", bins=30, normed=True, cumulative=True, range=(0,.3))
# plt.show()
plt.savefig("_dummyPy058.png", bbox_inches="tight")
plt.clf()
# df is degrees by gender from http://nces.ed.gov/programs/digest/2013menu_tables.asp
# DO NOT HAVE DATASET - skip
# Print the minimum value of the Engineering column
# print(df["Engineering"].min())
# Print the maximum value of the Engineering column
# print(df["Engineering"].max())
# Construct the mean percentage per year: mean
# mean = df.mean(axis="columns")
# Plot the average percentage per year
# mean.plot()
# Display the plot
# plt.show()
# Now, df appears to be the Titanic dataset (not the table)
df = pd.read_csv(myPath + "titanic.csv")
# Print summary statistics of the fare column with .describe()
print(df["Fare"].describe())
# Generate a box plot of the fare column
df["Fare"].plot(kind="box")
# Show the plot
# plt.show()
plt.savefig("_dummyPy059.png", bbox_inches="tight")
plt.clf()
# Now, df is the life-expectancy Gapminder data as 260x219
# Needs the encoding to load
df = pd.read_csv(myPath + "gapminder.csv", encoding="latin-1", index_col=0).pivot_table(index="country", columns="year", values="life_expectancy")
# Print the number of countries reported in 2015
print(df[2015].count())
# Print the 5th and 95th percentiles
print(df.quantile([0.05, 0.95]))
# Generate a box plot
years = [1800, 1850, 1900, 1950, 2000]
df[years].plot(kind='box')
# plt.show()
plt.savefig("_dummyPy060.png", bbox_inches="tight")
plt.clf()
# Now, df is Pittsburgh weather data from https://www.wunderground.com/history/
# NEED TO GET THIS DATA
# january and march are both 31x2 with the columns being Date-Temperature
df = pd.read_csv(myPath + "KPIT_Temps_Small.csv")
january = df[["Date", "jan"]]
march = df[["Date", "mar"]]
# Print the mean of the January and March data
print(january.mean(), "\n", march.mean())
# Print the standard deviation of the January and March data
print(january.std(), "\n", march.std())
# Here, df is again automobile data of shape (392, 9)
# NEED TO GET THIS DATA - using MASS::Cars93 instead
tempDF = pd.read_csv(myPath + "Cars93.csv")
tempDF["Origin"]
df = tempDF[["Origin", "MPG.city", "MPG.highway", "Weight", "Horsepower"]]
# Compute the global mean and global standard deviation: global_mean, global_std
global_mean = df.mean()
global_std = df.std()
# Filter the US population from the origin column: us
us = df.loc[df["Origin"] == "USA", :]
# Compute the US mean and US standard deviation: us_mean, us_std
us_mean = us.mean()
us_std = us.std()
# Print the differences
print(us_mean - global_mean)
print(us_std - global_std)
# titanic is 1309x14 of data from the titanic
titanic = pd.read_csv(myPath + "titanic.csv", index_col=0)
# Display the box plots on 3 separate rows and 1 column
fig, axes = plt.subplots(nrows=3, ncols=1)
# Generate a box plot of the fare prices for the First passenger class
titanic.loc[titanic['Pclass'] == 1].plot(ax=axes[0], y='Fare', kind='box')
# Generate a box plot of the fare prices for the Second passenger class
titanic.loc[titanic['Pclass'] == 2].plot(ax=axes[1], y='Fare', kind='box')
# Generate a box plot of the fare prices for the Third passenger class
titanic.loc[titanic['Pclass'] == 3].plot(ax=axes[2], y='Fare', kind='box')
# Display the plot
# plt.show()
plt.savefig("_dummyPy061.png", bbox_inches="tight")
plt.clf()
## count 891.000000
## mean 32.204208
## std 49.693429
## min 0.000000
## 25% 7.910400
## 50% 14.454200
## 75% 31.000000
## max 512.329200
## Name: Fare, dtype: float64
## 208
## year 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 \
## 0.05 25.40 25.30 25.20 25.2 25.2 25.40 25.40 25.40 25.3 25.3
## 0.95 37.92 37.35 38.37 38.0 38.3 38.37 38.37 38.37 38.0 38.0
##
## year ... 2007 2008 2009 2010 2011 2012 2013 2014 \
## 0.05 ... 53.07 53.60 54.235 54.935 55.97 56.335 56.705 56.87
## 0.95 ... 80.73 80.93 81.200 81.365 81.60 81.665 81.830 82.00
##
## year 2015 2016
## 0.05 57.855 59.2555
## 0.95 82.100 82.1650
##
## [2 rows x 217 columns]
## Date 16.000000
## jan 26.096774
## dtype: float64
## Date 16.000000
## mar 43.612903
## dtype: float64
## Date 9.092121
## jan 10.514608
## dtype: float64
## Date 9.092121
## mar 8.503636
## dtype: float64
## MPG.city -1.407258
## MPG.highway -0.940188
## Weight 122.409274
## Horsepower 3.692876
## dtype: float64
## MPG.city -1.625356
## MPG.highway -1.180389
## Weight -24.668815
## Horsepower 2.080330
## dtype: float64
Maximum Stock Price by Month:
MPG vs HP (sized by Cylinders):
Box Plots for Weight and MPG (mtcars):
PDF and CDF for Tip as Percentage of Total Bill:
Box Plots for Titanic Fares:
Box Plot for Life Expectancy by Country (Gapminder):
Titanic Fares by Class (First, Second, Third):
Chapter 3 - Time series in pandas
Indexing pandas time series - dates and times are stored in datetime options:
Resampling pandas time series - taking statistical measures over different time intervals:
Manipulating pandas time series - changing the data in one or more columns:
Visualizing pandas time series - additional plotting techniques such line types, plot types, and sub-plots:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
import matplotlib.pyplot as plt
# GREAT data is available at https://mesonet.agron.iastate.edu/request/download.phtml?network=IL_ASOS
# Downloaded KORD data from 2010 to myPath + "KORD_2010_from_IAState.txt"
# First 5 rows are commented, the sixth row is the header, and the next 10,443 rows are the data
# Load the file
tmpORD = pd.read_csv(myPath + "KORD_2010_from_IAState.txt", header=5)
tmpORD.columns = tmpORD.columns.str.strip()
isMETAR = tmpORD.loc[:, "valid"].str.contains(":51") # KORD METAR are taken at xx:51
useORD = tmpORD.loc[isMETAR, :] # ends as 8709 x 22, probably the METAR check missed a few at "off" times
date_list = useORD["valid"]
temperature_list = list(useORD["tmpf"])
# This is 8,759 temperature observations refelecting 20100101 00:00 through 20101231 23:00 on an hourly basis
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M'
# Convert date_list into a datetime object: my_datetimes
my_datetimes = pd.to_datetime(date_list, format=time_format)
# Construct a pandas Series using temperature_list and my_datetimes: time_series
# Something to explore later - this produced all np.nan if temperature_list were already a Series
ts0 = pd.Series(temperature_list, index=my_datetimes)
# Extract the hour from 9pm to 10pm on '2010-10-11': ts1
ts1 = ts0.loc['2010-10-11 20:51:00']
# Extract '2010-07-04' from ts0: ts2
ts2 = ts0.loc["2010-07-04"]
# Extract data from '2010-12-15' to '2010-12-31': ts3
ts3 = ts0.loc["2010-12-15":"2010-12-31"]
# Reindex without fill method: ts3
ts3 = ts2.reindex(ts0.index)
# Reindex with fill method, using forward fill: ts4
ts4 = ts2.reindex(ts0.index, method="ffill")
# Combine ts1 + ts2: sum12
sum12 = ts1 + ts2
# Combine ts1 + ts3: sum13
sum13 = ts1 + ts3
# Combine ts1 + ts4: sum14
sum14 = ts1 + ts4
# Still working with the temperature data, now renamed as df [technically, same index but containing Temperature-Dew Point-Pressure]
df = useORD[["tmpf", "dwpf", "alti"]]
df.index = my_datetimes
df.columns = ["Temperature", "DewPoint", "Pressure"]
saveWeather = df.copy()
# Downsample to 6 hour data and aggregate by mean: df1
df1 = df["Temperature"].resample("6H").mean()
# Downsample to daily data and count the number of data points: df2
df2 = df["Temperature"].resample("D").count()
# Extract temperature data for August: august
august = df.loc["2010-08", "Temperature"]
# Downsample to obtain only the daily highest temperatures in August: august_highs
august_highs = august.resample("D").max()
# Extract temperature data for February: february
february = df.loc["2010-02", "Temperature"]
# Downsample to obtain the daily lowest temperatures in February: february_lows
february_lows = february.resample("D").min()
# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed
unsmoothed = df['Temperature']["2010-08-01":"2010-08-15"]
# Apply a rolling mean with a 24 hour window: smoothed
smoothed = unsmoothed.rolling(window=24).mean()
# Create a new DataFrame with columns smoothed and unsmoothed: august
august = pd.DataFrame({'smoothed':smoothed, 'unsmoothed':unsmoothed})
# Plot both smoothed and unsmoothed data using august.plot().
august.plot()
# plt.show()
plt.savefig("_dummyPy062.png", bbox_inches="tight")
plt.clf()
# Extract the August 2010 data: august
august = df['Temperature']["2010-08"]
# Resample to daily data, aggregating by max: daily_highs
daily_highs = august.resample("D").max()
# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August
daily_highs_smoothed = daily_highs.rolling(window=7).mean()
print(daily_highs_smoothed)
# Plot the summer data
df = saveWeather.copy()
df.Temperature["2010-Jun":"2010-Aug"].plot()
# plt.show()
plt.savefig("_dummyPy063.png", bbox_inches="tight")
plt.clf()
# Plot the one week data
df.Temperature['2010-06-10':'2010-06-17'].plot()
# plt.show()
plt.savefig("_dummyPy064.png", bbox_inches="tight")
plt.clf()
# Now, df is 1741x17 of airline/airport data
# Saved the June 2011 data from hflights::hflights to csv
dfJun = pd.read_csv(myPath + "junFlights.csv")
dfJun["useMonth"] = ["{0:0>2}".format(x) for x in dfJun["Month"]]
dfJun["useDate"] = ["{0:0>2}".format(x) for x in dfJun["DayofMonth"]]
keyDates = dfJun["Year"].astype(str) + dfJun["useMonth"] + dfJun["useDate"]
time_format = '%Y%m%d'
useDates = pd.to_datetime(keyDates, format=time_format)
dfJun.index = useDates
df = dfJun[["DayOfWeek", "Dest", "DepTime", "ArrTime", "UniqueCarrier", "FlightNum"]]
df.columns = ["Weekday", "Destination Airport", "Wheels-off Time", "Arrival Time", "Carrier", "Flight"]
# Strip extra whitespace from the column names: df.columns
df.columns = df.columns.str.strip()
# Extract data for which the destination airport is Dallas: dallas
dallas = df['Destination Airport'].str.contains("DAL")
# Compute the total number of Dallas departures each day: daily_departures
daily_departures = dallas.resample("D").sum()
# Generate the summary statistics for daily Dallas departures: stats
stats = daily_departures.describe()
print(stats)
# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp
# ts2_interp = ts2.reindex(ts1.index).interpolate("linear")
# Compute the absolute difference of ts1 and ts2_interp: differences
# differences = np.abs(ts2_interp - ts1)
# Generate and print summary statistics of the differences
# print(differences.describe())
# Buid a Boolean mask to filter out all the 'LAX' departure flights: mask
import numpy as np
mask = df['Destination Airport'] == "LAX"
# Use the mask to subset the data: la
la = df[mask].dropna()
la["Date"] = la.index.astype(str)
la["Wheel Time"] = ["{0:0>4}".format(int(x)) for x in la["Wheels-off Time"]]
# Combine two columns of data to create a datetime series: times_tz_none
times_tz_none = pd.to_datetime(la["Date"] + " " + la["Wheel Time"])
# Localize the time to US/Central: times_tz_central
times_tz_central = times_tz_none.dt.tz_localize("US/Central")
# Convert the datetimes from US/Central to US/Pacific
times_tz_pacific = times_tz_central.dt.tz_convert("US/Pacific")
newDF = pd.DataFrame( {"Date":keyDates, "Carrier":list(df["Carrier"]), "nFlight":1} )
useCarrier = [x in ["XE", "CO", "WN", "OO"] for x in newDF["Carrier"]]
useDF = newDF.loc[useCarrier].pivot_table(index="Date", columns=["Carrier"], values=["nFlight"], aggfunc=sum)
# Plot the raw data before setting the datetime index
useDF.plot()
# plt.show()
plt.savefig("_dummyPy065.png", bbox_inches="tight")
plt.clf()
# Convert the 'Date' column into a collection of datetime objects: df.Date
useDF["Date"] = pd.to_datetime(useDF.index)
# Set the index to be the converted 'Date' column
useDF.set_index("Date", inplace=True) # inplace=True makes the conversion in place; no need to reassign
# Re-plot the DataFrame to see that the axis is now datetime aware!
useDF.plot()
# plt.show()
plt.savefig("_dummyPy066.png", bbox_inches="tight")
plt.clf()
## valid
## 2010-08-01 NaN
## 2010-08-02 NaN
## 2010-08-03 NaN
## 2010-08-04 NaN
## 2010-08-05 NaN
## 2010-08-06 NaN
## 2010-08-07 83.094286
## 2010-08-08 83.402857
## 2010-08-09 84.122857
## 2010-08-10 84.560000
## 2010-08-11 85.434286
## 2010-08-12 86.591429
## 2010-08-13 88.160000
## 2010-08-14 88.880000
## 2010-08-15 88.288571
## 2010-08-16 87.157143
## 2010-08-17 85.588571
## 2010-08-18 84.585714
## 2010-08-19 84.020000
## 2010-08-20 84.020000
## 2010-08-21 83.711429
## 2010-08-22 83.428571
## 2010-08-23 83.145714
## 2010-08-24 83.865714
## 2010-08-25 83.300000
## 2010-08-26 82.014286
## 2010-08-27 81.165714
## 2010-08-28 81.602857
## 2010-08-29 83.454286
## 2010-08-30 84.868571
## 2010-08-31 86.437143
## Freq: D, Name: Temperature, dtype: float64
## count 30.00000
## mean 26.30000
## std 4.05267
## min 17.00000
## 25% 25.75000
## 50% 28.00000
## 75% 28.00000
## max 30.00000
## Name: Destination Airport, dtype: float64
Chicago Temperatures (KORD) - August 2010:
Chicago Temperatures (KORD) - Summer 2010:
Chicago Temperatures (KORD) - June 10-17, 2010:
Flights per Day (Top 4 Carriers) - Houston, June 2011:
Index Formatted as Date-Time rather than String:
Chapter 4 - Case Study - Sunlight in Austin
Reading and cleaning the data - messy weather and climate data for Austin:
Statistical exploratory data analysis - slicing time series and the like:
Visual exploratory data analysis - histograms, line plots, box plots, and the like:
Example code includes:
myPath = "./PythonInputFiles/"
# Import pandas
import pandas as pd
# GREAT data is available at https://mesonet.agron.iastate.edu/request/download.phtml?network=TX_ASOS
# Downloaded KORD data from 2011 to myPath + "KAUS_2011_from_IAState.txt"
tmpAUS = pd.read_csv(myPath + "KAUS_2011_from_IAState.txt", header=5)
tmpAUS.columns = tmpAUS.columns.str.strip()
isMETAR = tmpAUS.loc[:, "valid"].str.contains(":53") # KAUS METAR are taken at xx:53
useAUS = tmpAUS.loc[isMETAR, :] # ends as 11,352 x 22, tons of duplicate METAR
useAUS = useAUS.drop_duplicates(subset=["valid"]) # ends as 8,432 x 22, some days with as few as 15 records
# First 5 rows are commented, the sixth row is the header, and the next 10,443 rows are the data
# Read in the data file: df
# df = pd.read_csv("data.csv")
df = useAUS.copy()
df["date"] = [x.split()[0] for x in df["valid"]]
df["time"] = [x.split()[1] for x in df["valid"]]
df["StationType"] = "Airport"
df["sky_condition"] = df["skyc1"] + df["skyc2"] + df["skyc3"] + df["skyc4"]
# Print the output of df.head()
print(df.head())
# This is the column_labels list (my data is different - modify)
# column_labels = "Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk"
# list_to_drop = ['sky_conditionFlag', 'visibilityFlag', 'wx_and_obst_to_vision', 'wx_and_obst_to_visionFlag', 'dry_bulb_farenFlag', 'dry_bulb_celFlag', 'wet_bulb_farenFlag', 'wet_bulb_celFlag', 'dew_point_farenFlag', 'dew_point_celFlag', 'relative_humidityFlag', 'wind_speedFlag', 'wind_directionFlag', 'value_for_wind_character', 'value_for_wind_characterFlag', 'station_pressureFlag', 'pressure_tendencyFlag', 'pressure_tendency', 'presschange', 'presschangeFlag', 'sea_level_pressureFlag', 'hourly_precip', 'hourly_precipFlag', 'altimeter', 'record_type', 'altimeterFlag', 'junk']
# Desired variables to be kept
# final_keep = ["Wban", "StationType", "date", "Time", "dry_bulb_faren", "dew_point_faren", "wet_bulb_faren", "dry_bulb_cel", "dew_point_cel", "wet_bulb_cel", "sky_condition", "station_pressure", "sea_level_pressure", "relative humidity", "wind_direction", "wind_speed", "visibility"]
final_keep = ["Wban", "StationType", "date", "Time", "dry_bulb_faren", "dew_point_faren", "sky_condition", "station_pressure", "sea_level_pressure", "relative humidity", "wind_direction", "wind_speed", "visibility"]
# Remove the appropriate columns: df_dropped
# df_dropped = df.drop(list_to_drop, axis="columns")
df_dropped = df.iloc[:, [0, 24, 22, 23, 2, 3, 25, 8, 9, 4, 5, 6, 10]]
df_dropped.columns = final_keep
# Print the output of df_dropped.head()
print(df_dropped.head())
print(df_dropped.shape)
# Convert the date column to string: df_dropped['date']
# df_dropped['date'] = df_dropped["date"].astype(str)
# Pad leading zeros to the Time column: df_dropped['Time']
# df_dropped['Time'] = df_dropped['Time'].apply(lambda x:'{:0>4}'.format(x))
# Concatenate the new date and Time columns: date_string
date_string = df_dropped['date'] + " " + df_dropped['Time']
# Convert the date_string Series to datetime: date_times
date_times = pd.to_datetime(date_string, format='%Y-%m-%d %H:%M')
# Set the index to be the new date_times container: df_clean
df_clean = df_dropped.set_index(date_times)
# Eliminate straggler record with index in 2010
is2011 = df_clean.index.year == 2011
df_clean = df_clean.loc[is2011, :]
# Print the output of df_clean.head()
print(df_clean.head())
print(df_clean.shape)
# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011
print(df_clean.loc["2011-06-20 08:00:00":"2011-06-20 09:00:00", "dry_bulb_faren"])
# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']
df_clean['dry_bulb_faren'] = pd.to_numeric(df_clean['dry_bulb_faren'], errors="coerce")
# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011
print(df_clean.loc["2011-06-20 08:00:00":"2011-06-20 09:00:00", "dry_bulb_faren"])
# Convert the wind_speed and dew_point_faren columns to numeric values
df_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors="coerce")
df_clean['dew_point_faren'] = pd.to_numeric(df_clean['dew_point_faren'], errors="coerce")
df_clean['visibility'] = pd.to_numeric(df_clean['visibility'], errors="coerce")
# Print the median of the dry_bulb_faren column
print(df_clean["dry_bulb_faren"].median())
# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'
print(df_clean.loc["2011-04":"2011-06", 'dry_bulb_faren'].median())
# Print the median of the dry_bulb_faren column for the month of January
print(df_clean.loc["2011-01", 'dry_bulb_faren'].median())
# Downsample df_clean by day and aggregate by mean: daily_mean_2011
daily_mean_2011 = df_clean.resample("D").mean()
# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011
daily_temp_2011 = daily_mean_2011["dry_bulb_faren"].values
# NEED FILE!
# Downsample df_climate by day and aggregate by mean: daily_climate
# daily_climate = df_climate.resample("D").mean()
# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate
# daily_temp_climate = daily_climate.reset_index()["Temperature"]
# Compute the difference between the two arrays and print the mean difference
# difference = daily_temp_2011 - daily_temp_climate
# print(difference.mean())
# Select days that are sunny: sunny
sunny = df_clean.loc[df_clean["sky_condition"].str.strip() == "CLR"]
# Select days that are overcast: overcast
overcast = df_clean.loc[df_clean["sky_condition"].str.contains("OVC")]
# Resample sunny and overcast, aggregating by maximum daily temperature
sunny_daily_max = sunny.resample("D").max()
overcast_daily_max = overcast.resample("D").max()
# Print the difference between the mean of sunny_daily_max and overcast_daily_max
print(sunny_daily_max.mean() - overcast_daily_max.mean())
# Import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
# Select the visibility and dry_bulb_faren columns and resample them: weekly_mean
weekly_mean = df_clean[["visibility", "dry_bulb_faren"]].resample("W").mean()
# Print the output of weekly_mean.corr()
print(weekly_mean.corr())
# Plot weekly_mean with subplots=True
weekly_mean.plot(subplots=True)
# plt.show()
plt.savefig("_dummyPy067.png", bbox_inches="tight")
plt.clf()
# Create a Boolean Series for sunny days: sunny
sunny = df_clean["sky_condition"].str.strip() == "CLR"
# Resample the Boolean Series by day and compute the sum: sunny_hours
sunny_hours = sunny.resample("D").sum()
# Resample the Boolean Series by day and compute the count: total_hours
total_hours = sunny.resample("D").count()
# Divide sunny_hours by total_hours: sunny_fraction
sunny_fraction = sunny_hours / total_hours
# Make a box plot of sunny_fraction
sunny_fraction.plot(kind="box")
# plt.show()
plt.savefig("_dummyPy068.png", bbox_inches="tight")
plt.clf()
# Resample dew_point_faren and dry_bulb_faren by Month, aggregating the maximum values: monthly_max
monthly_max = df_clean[['dew_point_faren', 'dry_bulb_faren']].resample("M").max()
# Generate a histogram with bins=8, alpha=0.5, subplots=True
monthly_max.plot(kind="hist", bins=8, alpha=0.5, subplots=True)
# Show the plot
# plt.show()
plt.savefig("_dummyPy069.png", bbox_inches="tight")
plt.clf()
# Recall that df_climate is a separate dataset of the 1981-2010 data
# NEED DATASET
# Extract the maximum temperature in August 2010 from df_climate: august_max
# august_max = df_climate.loc["2010-Aug", "Temperature"].max()
# print(august_max)
# Resample the August 2011 temperatures in df_clean by day and aggregate the maximum value: august_2011
# august_2011 = df_clean.loc["2011-Aug", "dry_bulb_faren"].resample("D").max()
# Filter out days in august_2011 where the value exceeded august_max: august_2011_high
# august_2011_high = august_2011.loc[august_2011 > august_max]
# Construct a CDF of august_2011_high
# august_2011_high.plot(kind="hist", bins=25, normed=True, cumulative=True)
# Display the plot
# plt.show()
## station valid tmpf dwpf relh drct sknt p01i alti \
## 0 AUS 2010-12-31 23:53 50.00 17.96 27.75 360.00 10.00 M 29.93
## 1 AUS 2011-01-01 00:53 51.08 15.08 23.54 360.00 13.00 M 29.95
## 2 AUS 2011-01-01 01:53 51.08 14.00 22.45 340.00 9.00 M 30.02
## 3 AUS 2011-01-01 02:53 51.08 12.92 21.41 10.00 13.00 M 30.02
## 4 AUS 2011-01-01 03:53 50.00 17.06 26.70 350.00 6.00 M 30.04
##
## mslp ... skyl1 skyl2 skyl3 skyl4 presentwx \
## 0 1013.20 ... 3900.00 M M M M
## 1 1014.20 ... 4500.00 M M M M
## 2 1016.20 ... 4900.00 M M M M
## 3 1016.20 ... 6000.00 M M M M
## 4 1017.00 ... 6500.00 M M M M
##
## metar date time \
## 0 KAUS 010553Z 36010KT 10SM BKN039 10/M08 A2993 ... 2010-12-31 23:53
## 1 KAUS 010653Z 36013KT 10SM OVC045 11/M09 A2995 ... 2011-01-01 00:53
## 2 KAUS 010753Z 34009KT 10SM OVC049 11/M10 A3002 ... 2011-01-01 01:53
## 3 KAUS 010853Z 01013KT 10SM OVC060 11/M11 A3002 ... 2011-01-01 02:53
## 4 KAUS 010953Z 35006KT 10SM OVC065 10/M08 A3004 ... 2011-01-01 03:53
##
## StationType sky_condition
## 0 Airport BKN
## 1 Airport OVC
## 2 Airport OVC
## 3 Airport OVC
## 4 Airport OVC
##
## [5 rows x 26 columns]
## Wban StationType date Time dry_bulb_faren dew_point_faren \
## 0 AUS Airport 2010-12-31 23:53 50.00 17.96
## 1 AUS Airport 2011-01-01 00:53 51.08 15.08
## 2 AUS Airport 2011-01-01 01:53 51.08 14.00
## 3 AUS Airport 2011-01-01 02:53 51.08 12.92
## 4 AUS Airport 2011-01-01 03:53 50.00 17.06
##
## sky_condition station_pressure sea_level_pressure relative humidity \
## 0 BKN 29.93 1013.20 27.75
## 1 OVC 29.95 1014.20 23.54
## 2 OVC 30.02 1016.20 22.45
## 3 OVC 30.02 1016.20 21.41
## 4 OVC 30.04 1017.00 26.70
##
## wind_direction wind_speed visibility
## 0 360.00 10.00 10.00
## 1 360.00 13.00 10.00
## 2 340.00 9.00 10.00
## 3 10.00 13.00 10.00
## 4 350.00 6.00 10.00
## (8432, 13)
## Wban StationType date Time dry_bulb_faren \
## 2011-01-01 00:53:00 AUS Airport 2011-01-01 00:53 51.08
## 2011-01-01 01:53:00 AUS Airport 2011-01-01 01:53 51.08
## 2011-01-01 02:53:00 AUS Airport 2011-01-01 02:53 51.08
## 2011-01-01 03:53:00 AUS Airport 2011-01-01 03:53 50.00
## 2011-01-01 04:53:00 AUS Airport 2011-01-01 04:53 50.00
##
## dew_point_faren sky_condition station_pressure \
## 2011-01-01 00:53:00 15.08 OVC 29.95
## 2011-01-01 01:53:00 14.00 OVC 30.02
## 2011-01-01 02:53:00 12.92 OVC 30.02
## 2011-01-01 03:53:00 17.06 OVC 30.04
## 2011-01-01 04:53:00 15.08 BKN 30.04
##
## sea_level_pressure relative humidity wind_direction \
## 2011-01-01 00:53:00 1014.20 23.54 360.00
## 2011-01-01 01:53:00 1016.20 22.45 340.00
## 2011-01-01 02:53:00 1016.20 21.41 10.00
## 2011-01-01 03:53:00 1017.00 26.70 350.00
## 2011-01-01 04:53:00 1017.20 24.50 20.00
##
## wind_speed visibility
## 2011-01-01 00:53:00 13.00 10.00
## 2011-01-01 01:53:00 9.00 10.00
## 2011-01-01 02:53:00 13.00 10.00
## 2011-01-01 03:53:00 6.00 10.00
## 2011-01-01 04:53:00 10.00 10.00
## (8431, 13)
## 2011-06-20 08:53:00 80.06
## Name: dry_bulb_faren, dtype: object
## 2011-06-20 08:53:00 80.06
## Name: dry_bulb_faren, dtype: float64
## 73.04
## 78.8
## 46.94
## dry_bulb_faren 6.827911
## dew_point_faren -3.915446
## station_pressure -0.002711
## wind_speed -2.321292
## visibility 0.174696
## dtype: float64
## visibility dry_bulb_faren
## visibility 1.000000 0.456775
## dry_bulb_faren 0.456775 1.000000
Mean Visibility and Temperature - Austin, TX 2011:
Percentage of Time with Clear Skies (CLR/SKC) by Day - Austin, TX 2011:
Histogram for Maximum Monthly Temperature and Dew Point - Austin, TX 2011:
Chapter 1 - Extracting and transforming data
Indexing DataFrames - multiple ways to extract data from the pandas DataFrame:
Slicing DataFrames - different return types that come from indexing a pandas DataFrame:
Filtering DataFrames - general tool for selecting part of the data based on its properties rather than its indices (typically by way of Booleans):
Transforming DataFrames - best practice is to use built-in pandas methods, and otherwise by universal numpy methods:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
# NEED DATA FRAME election (67 x 8) - indexed by county with columns state (PA) - total - Obama - Romney - winner - voters - turnout - margin
# appears to be 2012 US general election data, with the Obama and Romney columns being percentages, total being total votes, and voters being registered voters
# Saved the DataCamp file to myPath + "PAElection_2012.csv"
electionPA = pd.read_csv(myPath + "PAElection_2012.csv", index_col="county")
election = electionPA.copy()
# Assign the row position of election.loc['Bedford']: x
x = 4
# Assign the column position of election['winner']: y
y = 4
# Print the boolean equivalence
print(election.iloc[x, y] == election.loc['Bedford', 'winner'])
# DO NOT RUN - downloaded to myPath + "PAElection2012.csv" instead
# filename = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/pennsylvania2012.csv'
# election = pd.read_csv(filename, index_col='county')
# Create a separate dataframe with the columns ['winner', 'total', 'voters']: results
results = election[['winner', 'total', 'voters']]
# Print the output of results.head()
print(results.head())
# Slice the columns from the starting column to 'Obama': left_columns
left_columns = election.loc[:, :"Obama"]
# Print the output of left_columns.head()
print(left_columns.head())
# Slice the columns from 'Obama' to 'winner': middle_columns
middle_columns = election.loc[:, "Obama":"winner"]
# Print the output of middle_columns.head()
print(middle_columns.head())
# Slice the columns from 'Romney' to the end: 'right_columns'
right_columns = election.loc[:, "Romney":]
# Print the output of right_columns.head()
print(right_columns.head())
# Create the list of row labels: rows
rows = ['Philadelphia', 'Centre', 'Fulton']
# Create the list of column labels: cols
cols = ['winner', 'Obama', 'Romney']
# Create the new DataFrame: three_counties
three_counties = election.loc[rows, cols]
# Print the three_counties DataFrame
print(three_counties)
# Create a turnout category
election["turnout"] = 100 * election["total"] / election["voters"]
# Create the boolean array: high_turnout
high_turnout = election["turnout"] > 70
# Filter the election DataFrame with the high_turnout array: high_turnout_df
high_turnout_df = election[high_turnout]
# Print the high_turnout_results DataFrame
print(high_turnout_df)
# Import numpy
import numpy as np
# Create the election["margin"] column
election["margin"] = abs(election["Obama"] - election["Romney"])
# Create the boolean array: too_close
too_close = election["margin"] < 1
# Assign np.nan to the 'winner' column where the results were too close to call
election["winner"][too_close] = np.nan
# Print the output of election.info()
print(election.info())
# NEED DATASET titanic (1309 x 14)
# User version saved previously
titanic = pd.read_csv(myPath + 'titanic.csv', index_col=0)
# Select the 'age' and 'cabin' columns: df
df = titanic[["Age", "Cabin"]]
# Print the shape of df
print(df.shape)
# Drop rows in df with how='any' and print the shape
print(df.dropna(how="any").shape)
# Drop rows in df with how='all' and print the shape
print(df.dropna(how="all").shape)
# Call .dropna() with thresh=1000 and axis='columns' and print the output of .info() from titanic
print(titanic.dropna(thresh=500, axis='columns').info())
# NEED DATASET weather which is 365 x 23 from Weather Underground, representing Pittsburgh weather data for 2013
# https://www.wunderground.com/history
# Use the KORD METAR data instead
# Load the file
tmpORD = pd.read_csv(myPath + "KORD_2010_from_IAState.txt", header=5)
tmpORD.columns = tmpORD.columns.str.strip()
isMETAR = tmpORD.loc[:, "valid"].str.contains(":51") # KORD METAR are taken at xx:51
useORD = tmpORD.loc[isMETAR, :] # ends as 8709 x 22, probably the METAR check missed a few at "off" times
date_list = useORD["valid"]
time_format = '%Y-%m-%d %H:%M'
my_datetimes = pd.to_datetime(date_list, format=time_format)
useORD.index = my_datetimes
# Just keep the temperature and dew point
weather = useORD[["tmpf", "dwpf"]]
weather.columns = ['Mean TemperatureF','Mean Dew PointF']
# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
return 5/9*(F - 32)
# Apply the function over 'Mean TemperatureF' and 'Mean Dew PointF': df_celsius
df_celsius = weather[['Mean TemperatureF','Mean Dew PointF']].apply(to_celsius)
# Reassign the columns df_celsius
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']
# Print the output of df_celsius.head()
print(df_celsius.head())
# Create the dictionary: red_vs_blue
red_vs_blue = {"Obama":"blue", "Romney":"red"}
# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election["winner"].map(red_vs_blue)
# Print the output of election.head()
print(election.head())
# Import zscore from scipy.stats
# Need to solve BLAS/LAPACK issue - cannot get scipy to download and install . . .
# from scipy.stats import zscore
import numpy as np
def zscore(x):
mu = np.mean(x)
sd = np.std(x)
return((x - mu) / sd)
# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election["turnout"])
# Print the type of turnout_zscore
print(type(turnout_zscore))
# Assign turnout_zscore to a new column: election['turnout_zscore']
election["turnout_zscore"] = turnout_zscore
# Print the output of election.head()
print(election.head())
## -c:90: SettingWithCopyWarning:
## A value is trying to be set on a copy of a slice from a DataFrame
##
## See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
## True
## winner total voters
## county
## Adams Romney 41973 61156
## Allegheny Obama 614671 924351
## Armstrong Romney 28322 42147
## Beaver Romney 80015 115157
## Bedford Romney 21444 32189
## state total Obama
## county
## Adams PA 41973 35.482334
## Allegheny PA 614671 56.640219
## Armstrong PA 28322 30.696985
## Beaver PA 80015 46.032619
## Bedford PA 21444 22.057452
## Obama Romney winner
## county
## Adams 35.482334 63.112001 Romney
## Allegheny 56.640219 42.185820 Obama
## Armstrong 30.696985 67.901278 Romney
## Beaver 46.032619 52.637630 Romney
## Bedford 22.057452 76.986570 Romney
## Romney winner voters
## county
## Adams 63.112001 Romney 61156
## Allegheny 42.185820 Obama 924351
## Armstrong 67.901278 Romney 42147
## Beaver 52.637630 Romney 115157
## Bedford 76.986570 Romney 32189
## winner Obama Romney
## county
## Philadelphia Obama 85.224251 14.051451
## Centre Romney 48.948416 48.977486
## Fulton Romney 21.096291 77.748861
## state total Obama Romney winner voters turnout
## county
## Bucks PA 319407 49.966970 48.801686 Obama 435606 73.324748
## Butler PA 88924 31.920516 66.816607 Romney 122762 72.436096
## Chester PA 248295 49.228539 49.650617 Romney 337822 73.498766
## Forest PA 2308 38.734835 59.835355 Romney 3232 71.410891
## Franklin PA 62802 30.110506 68.583803 Romney 87406 71.850903
## Montgomery PA 401787 56.637223 42.286834 Obama 551105 72.905708
## Westmoreland PA 168709 37.567646 61.306154 Romney 238006 70.884347
## <class 'pandas.core.frame.DataFrame'>
## Index: 67 entries, Adams to York
## Data columns (total 8 columns):
## state 67 non-null object
## total 67 non-null int64
## Obama 67 non-null float64
## Romney 67 non-null float64
## winner 64 non-null object
## voters 67 non-null int64
## turnout 67 non-null float64
## margin 67 non-null float64
## dtypes: float64(4), int64(2), object(2)
## memory usage: 5.4+ KB
## None
## (891, 2)
## (185, 2)
## (733, 2)
## <class 'pandas.core.frame.DataFrame'>
## Int64Index: 891 entries, 1 to 891
## Data columns (total 11 columns):
## PassengerId 891 non-null int64
## Survived 891 non-null int64
## Pclass 891 non-null int64
## Name 891 non-null object
## Sex 891 non-null object
## Age 714 non-null float64
## SibSp 891 non-null int64
## Parch 891 non-null int64
## Ticket 891 non-null object
## Fare 891 non-null float64
## Embarked 889 non-null object
## dtypes: float64(2), int64(5), object(4)
## memory usage: 69.6+ KB
## None
## Mean TemperatureC Mean Dew PointC
## valid
## 2010-01-01 00:51:00 -9.4 -16.1
## 2010-01-01 01:51:00 -10.0 -16.1
## 2010-01-01 02:51:00 -11.1 -16.1
## 2010-01-01 03:51:00 -11.7 -16.7
## 2010-01-01 04:51:00 -12.2 -16.7
## state total Obama Romney winner voters turnout \
## county
## Adams PA 41973 35.482334 63.112001 Romney 61156 68.632677
## Allegheny PA 614671 56.640219 42.185820 Obama 924351 66.497575
## Armstrong PA 28322 30.696985 67.901278 Romney 42147 67.198140
## Beaver PA 80015 46.032619 52.637630 Romney 115157 69.483401
## Bedford PA 21444 22.057452 76.986570 Romney 32189 66.619031
##
## margin color
## county
## Adams 27.629667 red
## Allegheny 14.454399 blue
## Armstrong 37.204293 red
## Beaver 6.605012 red
## Bedford 54.929118 red
## <class 'pandas.core.series.Series'>
## state total Obama Romney winner voters turnout \
## county
## Adams PA 41973 35.482334 63.112001 Romney 61156 68.632677
## Allegheny PA 614671 56.640219 42.185820 Obama 924351 66.497575
## Armstrong PA 28322 30.696985 67.901278 Romney 42147 67.198140
## Beaver PA 80015 46.032619 52.637630 Romney 115157 69.483401
## Bedford PA 21444 22.057452 76.986570 Romney 32189 66.619031
##
## margin color turnout_zscore
## county
## Adams 27.629667 red 0.853734
## Allegheny 14.454399 blue 0.439846
## Armstrong 37.204293 red 0.575650
## Beaver 6.605012 red 1.018647
## Bedford 54.929118 red 0.463391
Chapter 2 - Advanced Indexing
Index objects and labeled data - one of the key building blocks of the pandas Data Structures:
Hierarchical indexing - representing multi-dimensional index data:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
import numpy as np
sales = pd.DataFrame()
sales["eggs"] = [47, 110, 221, 77, 132, 205]
sales["salt"] = [12, 50, 89, 87, np.nan, 60]
sales["spam"] = [17, 31, 72, 20, 52, 55]
sales.index = ["jan", "feb", "mar", "apr", "may", "jun"]
# Create the list of new indexes: new_idx
new_idx = [x.upper() for x in sales.index]
# Assign new_idx to sales.index
sales.index = new_idx
# Print the sales DataFrame
print(sales)
# Assign the string 'MONTHS' to sales.index.name
sales.index.name = "MONTHS"
# Print the sales DataFrame
print(sales)
# Assign the string 'PRODUCTS' to sales.columns.name
sales.columns.name = "PRODUCTS"
# Print the sales dataframe again
print(sales)
# Generate the list of months: months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
# Assign months to sales.index
sales.index = months
# Print the modified sales DataFrame
print(sales)
# NEED TO MODIFY sales so it is the same data but indexed as CA/1, CA/2, NY/1, NY/2, TX/1, TX/2 (using state-month)
sales = sales.set_index([["CA", "CA", "NY", "NY","TX", "TX"], [1, 2, 1, 2, 1, 2]])
# Print sales.loc[['CA', 'TX']]
print(sales.loc[['CA', 'TX']])
# Print sales['CA':'TX']
print(sales['CA':'TX'])
# Now, sales is again a non-indexed DataFrame with sate-month as columns
# Set the index to be the columns ['state', 'month']: sales
states = [x for x, y in list(sales.index)]
months = [y for x, y in list(sales.index)]
sales.index = range(sales.shape[0])
sales["state"] = states
sales["month"] = months
oldSales = sales.copy()
sales = sales.set_index(['state', 'month'])
# Sort the MultiIndex: sales
sales = sales.sort_index(ascending=False)
# Print the sales DataFrame
print(sales)
multiSales = sales.copy()
# Go back to the sales as it was prior to indexing in the above step
# Set the index to the column 'state': sales
sales = oldSales.set_index(["state"])
# Print the sales DataFrame
print(sales)
# Access the data from 'NY'
print(sales.loc["NY"])
# Go back to sales as the Multi-Index dataset again . . .
sales = multiSales.copy()
sales = sales.sort_index(ascending=True) # Could not grab witout error unless ascending=True
# Look up data for NY in month 1: NY_month1
NY_month1 = sales.loc[ ("NY", 1) ]
# Look up data for CA and TX in month 2: CA_TX_month2
CA_TX_month2 = sales.loc[ (["CA", "TX"], 2) , :]
# Look up data for all states in month 2: all_month2
all_month2 = sales.loc[ (slice(None), 2), :]
## eggs salt spam
## JAN 47 12.0 17
## FEB 110 50.0 31
## MAR 221 89.0 72
## APR 77 87.0 20
## MAY 132 NaN 52
## JUN 205 60.0 55
## eggs salt spam
## MONTHS
## JAN 47 12.0 17
## FEB 110 50.0 31
## MAR 221 89.0 72
## APR 77 87.0 20
## MAY 132 NaN 52
## JUN 205 60.0 55
## PRODUCTS eggs salt spam
## MONTHS
## JAN 47 12.0 17
## FEB 110 50.0 31
## MAR 221 89.0 72
## APR 77 87.0 20
## MAY 132 NaN 52
## JUN 205 60.0 55
## PRODUCTS eggs salt spam
## Jan 47 12.0 17
## Feb 110 50.0 31
## Mar 221 89.0 72
## Apr 77 87.0 20
## May 132 NaN 52
## Jun 205 60.0 55
## PRODUCTS eggs salt spam
## CA 1 47 12.0 17
## 2 110 50.0 31
## TX 1 132 NaN 52
## 2 205 60.0 55
## PRODUCTS eggs salt spam
## CA 1 47 12.0 17
## 2 110 50.0 31
## NY 1 221 89.0 72
## 2 77 87.0 20
## TX 1 132 NaN 52
## 2 205 60.0 55
## PRODUCTS eggs salt spam
## state month
## TX 2 205 60.0 55
## 1 132 NaN 52
## NY 2 77 87.0 20
## 1 221 89.0 72
## CA 2 110 50.0 31
## 1 47 12.0 17
## PRODUCTS eggs salt spam month
## state
## CA 47 12.0 17 1
## CA 110 50.0 31 2
## NY 221 89.0 72 1
## NY 77 87.0 20 2
## TX 132 NaN 52 1
## TX 205 60.0 55 2
## PRODUCTS eggs salt spam month
## state
## NY 221 89.0 72 1
## NY 77 87.0 20 2
Chapter 3 - Rearranging and Reshaping Data
Pivoting DataFrames - changing shapes to one that better suits analysis needs:
Stacking and unstaking DataFrames - the idea of moving variables to/from the index so that the columns match data needs:
Melting DataFrames - converting pivoted data back in to a column format:
Pivot tables are needed when there are multiple rows with the same index (if pivoted) - need to specify how to manage the duplicates:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
users=pd.DataFrame()
users["weekday"] = ["Sun", "Sun", "Mon", "Mon"]
users["city"] = ["Austin", "Dallas", "Austin", "Dallas"]
users["visitors"] = [139, 237, 326, 456]
users["signups"] = [7, 12, 3, 5]
# Pivot the users DataFrame: visitors_pivot
visitors_pivot = users.pivot(index="weekday", columns="city", values="visitors")
# Print the pivoted DataFrame
print(visitors_pivot)
# Pivot users with signups indexed by weekday and city: signups_pivot
signups_pivot = users.pivot(index="weekday", columns="city", values="signups")
# Print signups_pivot
print(signups_pivot)
# Pivot users pivoted by both signups and visitors: pivot
pivot = users.pivot(index="weekday", columns="city")
# Print the pivoted DataFrame
print(pivot)
a = users.set_index(["city", "weekday"])
users = a.sort_index()
# Unstack users by 'weekday': byweekday
byweekday = users.unstack(level="weekday")
# Print the byweekday DataFrame
print(byweekday)
# Stack byweekday by 'weekday' and print it
print(byweekday.stack(level="weekday"))
# Unstack users by 'city': bycity
bycity = users.unstack(level="city")
# Print the bycity DataFrame
print(bycity)
# Stack bycity by 'city' and print it
print(bycity.stack(level="city"))
# Stack 'city' back into the index of bycity: newusers
newusers = bycity.stack(level="city")
# Swap the levels of the index of newusers: newusers
newusers = newusers.swaplevel(0, 1)
# Print newusers and verify that the index is not sorted
print(newusers)
# Sort the index of newusers: newusers
newusers = newusers.sort_index()
# Print newusers and verify that the index is now sorted
print(newusers)
# Verify that the new DataFrame is equal to the original
print(newusers.equals(users))
visitors_by_city_weekday = users[["visitors"]].unstack(level="city").reset_index()
visitors_by_city_weekday.columns = ["weekday", "Austin", "Dallas"]
# Reset the index: visitors_by_city_weekday
# visitors_by_city_weekday = visitors_by_city_weekday.reset_index() # this needed to be done above to get the column names right . . .
# Print visitors_by_city_weekday
print(visitors_by_city_weekday)
# Melt visitors_by_city_weekday: visitors
visitors = pd.melt(visitors_by_city_weekday, id_vars=["weekday"], value_name="visitors", var_name="city")
# Print visitors
print(visitors)
users=pd.DataFrame()
users["weekday"] = ["Sun", "Sun", "Mon", "Mon"]
users["city"] = ["Austin", "Dallas", "Austin", "Dallas"]
users["visitors"] = [139, 237, 326, 456]
users["signups"] = [7, 12, 3, 5]
# Melt users: skinny
skinny = pd.melt(users, id_vars = ["weekday", "city"], value_vars=["visitors", "signups"])
# Print skinny
print(skinny)
# Set the new index: users_idx
users_idx = users.set_index(['city', 'weekday'])
# Print the users_idx DataFrame
print(users_idx)
# Obtain the key-value pairs: kv_pairs
kv_pairs = pd.melt(users_idx, col_level=0)
# Print the key-value pairs
print(kv_pairs)
# Create the DataFrame with the appropriate pivot table: by_city_day
by_city_day = users.pivot_table(index="weekday", columns="city")
# Print by_city_day
print(by_city_day)
# Use a pivot table to display the count of each column: count_by_weekday1
count_by_weekday1 = users.pivot_table(index="weekday", aggfunc="count")
# Print count_by_weekday
print(count_by_weekday1)
# Replace 'aggfunc='count'' with 'aggfunc=len': count_by_weekday2
count_by_weekday2 = users.pivot_table(index="weekday", aggfunc=len)
# Verify that the same result is obtained
print('==========================================')
print(count_by_weekday1.equals(count_by_weekday2))
# Create the DataFrame with the appropriate pivot table: signups_and_visitors
signups_and_visitors = users.pivot_table(index="weekday", aggfunc=sum)
# Print signups_and_visitors
print(signups_and_visitors)
# Add in the margins: signups_and_visitors_total
signups_and_visitors_total = users.pivot_table(index="weekday", aggfunc=sum, margins=True)
# Print signups_and_visitors_total
print(signups_and_visitors_total)
## city Austin Dallas
## weekday
## Mon 326 456
## Sun 139 237
## city Austin Dallas
## weekday
## Mon 3 5
## Sun 7 12
## visitors signups
## city Austin Dallas Austin Dallas
## weekday
## Mon 326 456 3 5
## Sun 139 237 7 12
## visitors signups
## weekday Mon Sun Mon Sun
## city
## Austin 326 139 3 7
## Dallas 456 237 5 12
## visitors signups
## city weekday
## Austin Mon 326 3
## Sun 139 7
## Dallas Mon 456 5
## Sun 237 12
## visitors signups
## city Austin Dallas Austin Dallas
## weekday
## Mon 326 456 3 5
## Sun 139 237 7 12
## visitors signups
## weekday city
## Mon Austin 326 3
## Dallas 456 5
## Sun Austin 139 7
## Dallas 237 12
## visitors signups
## city weekday
## Austin Mon 326 3
## Dallas Mon 456 5
## Austin Sun 139 7
## Dallas Sun 237 12
## visitors signups
## city weekday
## Austin Mon 326 3
## Sun 139 7
## Dallas Mon 456 5
## Sun 237 12
## True
## weekday Austin Dallas
## 0 Mon 326 456
## 1 Sun 139 237
## weekday city visitors
## 0 Mon Austin 326
## 1 Sun Austin 139
## 2 Mon Dallas 456
## 3 Sun Dallas 237
## weekday city variable value
## 0 Sun Austin visitors 139
## 1 Sun Dallas visitors 237
## 2 Mon Austin visitors 326
## 3 Mon Dallas visitors 456
## 4 Sun Austin signups 7
## 5 Sun Dallas signups 12
## 6 Mon Austin signups 3
## 7 Mon Dallas signups 5
## visitors signups
## city weekday
## Austin Sun 139 7
## Dallas Sun 237 12
## Austin Mon 326 3
## Dallas Mon 456 5
## variable value
## 0 visitors 139
## 1 visitors 237
## 2 visitors 326
## 3 visitors 456
## 4 signups 7
## 5 signups 12
## 6 signups 3
## 7 signups 5
## signups visitors
## city Austin Dallas Austin Dallas
## weekday
## Mon 3 5 326 456
## Sun 7 12 139 237
## city signups visitors
## weekday
## Mon 2 2 2
## Sun 2 2 2
## ==========================================
## True
## signups visitors
## weekday
## Mon 8 782
## Sun 19 376
## signups visitors
## weekday
## Mon 8.0 782.0
## Sun 19.0 376.0
## All 27.0 1158.0
Chapter 4 - Grouping data
Categoricals and groupby - using the .groupby() method and then chaining various commands to it:
Groupby and aggregation - running mutlipe calculations after the split and before the combine:
Groupby and transformation - applying different transformations to different groups:
Groupby and filtering - filtering groups prior to aggregating:
Example code includes:
myPath = "./PythonInputFiles/"
# Need to bring in "titanic" (1309 x 14)
import pandas as pd
titanic = pd.read_csv(myPath + 'titanic.csv', index_col=0)
titanic.columns = ['id', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']
# titanic.columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest']
# Group titanic by 'pclass'
by_class = titanic.groupby("pclass")
# Aggregate 'survived' column of by_class by count
count_by_class = by_class["survived"].count()
# Print count_by_class
print(count_by_class)
# Group titanic by 'embarked' and 'pclass'
by_mult = titanic.groupby(["embarked", "pclass"])
# Aggregate 'survived' column of by_mult by count
count_mult = by_mult["survived"].count()
# Print count_mult
print(count_mult)
# Saved to myPath as lifeSaved.csv and regionsSaved.csv
# life_f = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/life_expectancy.csv'
# regions_f = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/regions.csv'
life = pd.read_csv(myPath + "lifeSaved.csv", index_col='Country', encoding="latin-1")
regions = pd.read_csv(myPath + "regionsSaved.csv", index_col='Country', encoding="latin-1")
# Group life by regions['region']: life_by_region
life_by_region = life.groupby(regions["region"])
# Print the mean over the '2010' column of life_by_region
print(life_by_region["2010"].mean())
# Again using the titanic dataset (same as above)
# Group titanic by 'pclass': by_class
by_class = titanic.groupby("pclass")
# Select 'age' and 'fare'
by_class_sub = by_class[['age','fare']]
# Aggregate by_class_sub by 'max' and 'median': aggregated
aggregated = by_class_sub.agg(["max", "median"])
# Print the maximum age in each class
print(aggregated.loc[:, ('age','max')])
# Print the median fare in each class
print(aggregated.loc[:, ('fare', 'median')])
# Read the CSV file into a DataFrame and sort the index: gapminder
# NEED FILE!
# gapminder = pd.read_csv("gapminder.csv", index_col=['Year','region','Country']).sort_index()
# Group gapminder by 'Year' and 'region': by_year_region
# by_year_region = gapminder.groupby(level=["Year", "region"])
# Define the function to compute spread: spread
# def spread(series):
# return series.max() - series.min()
# Create the dictionary: aggregator
# aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}
# Aggregate by_year_region using the dictionary: aggregated
# aggregated = by_year_region.agg(aggregator)
# Print the last 6 entries of aggregated
# print(aggregated.tail(6))
# NEED FILE
# Read file: sales
# sales = pd.read_csv("sales.csv", index_col="Date", parse_dates=True)
# Create a groupby object: by_day
# by_day = sales.groupby(sales.index.strftime('%a'))
# Create sum: units_sum
# units_sum = by_day.sum()
# Print units_sum
# print(units_sum)
# Import zscore
# from scipy.stats import zscore
# Group gapminder_2010: standardized
# standardized = gapminder_2010.groupby("region")[['life','fertility']].transform(zscore)
# Construct a Boolean Series to identify outliers: outliers
# outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)
# Filter gapminder_2010 by the outliers: gm_outliers
# gm_outliers = gapminder_2010.loc[outliers]
# Print gm_outliers
# print(gm_outliers)
# Create a groupby object: by_sex_class
by_sex_class = titanic.groupby(["sex", "pclass"])
# Write a function that imputes median
def impute_median(series):
return series.fillna(series.median())
# Impute age and assign to titanic['age']
titanic.age = by_sex_class["age"].transform(impute_median)
# Print the output of titanic.tail(10)
print(titanic.tail(10))
def disparity(gr):
# Compute the spread of gr['gdp']: s
s = gr['gdp'].max() - gr['gdp'].min()
# Compute the z-score of gr['gdp'] as (gr['gdp']-gr['gdp'].mean())/gr['gdp'].std(): z
z = (gr['gdp'] - gr['gdp'].mean())/gr['gdp'].std()
# Return a DataFrame with the inputs {'z(gdp)':z, 'regional spread(gdp)':s}
return pd.DataFrame({'z(gdp)':z , 'regional spread(gdp)':s})
# NEED FILE!
# Group gapminder_2010 by 'region': regional
# regional = gapminder_2010.groupby("region")
# Apply the disparity function on regional: reg_disp
# reg_disp = regional.apply(disparity)
# Print the disparity of 'United States', 'United Kingdom', and 'China'
# print(reg_disp.loc[['United States','United Kingdom','China'], :])
def c_deck_survival(gr):
c_passengers = gr['cabin'].str.startswith('C').fillna(False)
return gr.loc[c_passengers, 'survived'].mean()
# Create a groupby object using titanic over the 'sex' column: by_sex
by_sex = titanic.groupby("sex")
# Call by_sex.apply with the function c_deck_survival and print the result
c_surv_by_sex = by_sex.apply(c_deck_survival)
# Print the survival rates
print(c_surv_by_sex)
# NEED FILE!
# Read the CSV file into a DataFrame: sales
# sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)
# Group sales by 'Company': by_company
# by_company = sales.groupby("Company")
# Compute the sum of the 'Units' of by_company: by_com_sum
# by_com_sum = by_company["Units"].sum()
# print(by_com_sum)
# Filter 'Units' where the sum is > 35: by_com_filt
# by_com_filt = by_company.filter(lambda g:g['Units'].sum() > 35)
# print(by_com_filt)
# Create the Boolean Series: under10
under10 = (titanic['age'] < 10).map({True:'under 10', False:'over 10'})
# Group by under10 and compute the survival rate
survived_mean_1 = titanic.groupby(under10)["survived"].mean()
print(survived_mean_1)
# Group by under10 and pclass and compute the survival rate
survived_mean_2 = titanic.groupby([under10, "pclass"])["survived"].mean()
print(survived_mean_2)
## pclass
## 1 216
## 2 184
## 3 491
## Name: survived, dtype: int64
## embarked pclass
## C 1 85
## 2 17
## 3 66
## Q 1 2
## 2 3
## 3 72
## S 1 127
## 2 164
## 3 353
## Name: survived, dtype: int64
## region
## America 74.037350
## East Asia & Pacific 73.405750
## Europe & Central Asia 75.656387
## Middle East & North Africa 72.805333
## South Asia 68.189750
## Sub-Saharan Africa 57.575080
## Name: 2010, dtype: float64
## pclass
## 1 80.0
## 2 70.0
## 3 74.0
## Name: (age, max), dtype: float64
## pclass
## 1 60.2875
## 2 14.2500
## 3 8.0500
## Name: (fare, median), dtype: float64
## id survived pclass name sex \
## 882 882 0 3 Markun, Mr. Johann male
## 883 883 0 3 Dahlberg, Miss. Gerda Ulrika female
## 884 884 0 2 Banfield, Mr. Frederick James male
## 885 885 0 3 Sutehall, Mr. Henry Jr male
## 886 886 0 3 Rice, Mrs. William (Margaret Norton) female
## 887 887 0 2 Montvila, Rev. Juozas male
## 888 888 1 1 Graham, Miss. Margaret Edith female
## 889 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female
## 890 890 1 1 Behr, Mr. Karl Howell male
## 891 891 0 3 Dooley, Mr. Patrick male
##
## age sibsp parch ticket fare cabin embarked
## 882 33.0 0 0 349257 7.8958 NaN S
## 883 22.0 0 0 7552 10.5167 NaN S
## 884 28.0 0 0 C.A./SOTON 34068 10.5000 NaN S
## 885 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S
## 886 39.0 0 5 382652 29.1250 NaN Q
## 887 27.0 0 0 211536 13.0000 NaN S
## 888 19.0 0 0 112053 30.0000 B42 S
## 889 21.5 1 2 W./C. 6607 23.4500 NaN S
## 890 26.0 0 0 111369 30.0000 C148 C
## 891 32.0 0 0 370376 7.7500 NaN Q
## sex
## female 0.888889
## male 0.343750
## dtype: float64
## age
## over 10 0.366707
## under 10 0.612903
## Name: survived, dtype: float64
## age pclass
## over 10 1 0.629108
## 2 0.419162
## 3 0.222717
## under 10 1 0.666667
## 2 1.000000
## 3 0.452381
## Name: survived, dtype: float64
Chapter 5 - Case Study (Summer Olympics)
Introduction to the Summer Olympics data and analysis objectives:
Understanding the column labels - looking at the Gender and event_gender columns to understand how they are different:
Constructing alternative country rankings:
Reshaping DataFrames for visualization:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
import matplotlib.pyplot as plt
# Data is from https://www.theguardian.com/sport/datablog/2012/jun/25/olympic-medal-winner-list-data
# medals is 29216x10 with ['City', 'Edition', 'Sport', 'Discipline', 'Athlete', 'NOC', 'Gender', 'Event', 'Event_gender', 'Medal']
# Downloaded file from Guardian as myPath + "summerOlympics_Medalists_1896_2008.csv" - read file in
medals = pd.read_csv(myPath + "summerOlympics_Medalists_1896_2008.csv", header=4)
USA_edition_grouped = medals.loc[medals.NOC == 'USA'].groupby('Edition')
# Select the 'NOC' column of medals: country_names
country_names = medals["NOC"]
# Count the number of medals won by each country: medal_counts
medal_counts = country_names.value_counts()
# Print top 15 countries ranked by medals
print(medal_counts.head(15))
# Construct the pivot table: counted
counted = medals.pivot_table(index="NOC", columns="Medal", values="Athlete", aggfunc="count")
# Create the new column: counted['totals']
counted['totals'] = counted.sum(axis="columns")
# Sort counted by the 'totals' column
counted = counted.sort_values("totals", ascending=False)
# Print the top 15 rows of counted
print(counted.head(15))
# Select columns: ev_gen
ev_gen = medals[["Event_gender", "Gender"]]
# Drop duplicate pairs: ev_gen_uniques
ev_gen_uniques = ev_gen.drop_duplicates()
# Print ev_gen_uniques
print(ev_gen_uniques)
# Group medals by the two columns: medals_by_gender
medals_by_gender = medals.groupby(['Event_gender', 'Gender'])
# Create a DataFrame with a group count: medal_count_by_gender
medal_count_by_gender = medals_by_gender.count()
# Print medal_count_by_gender
print(medal_count_by_gender)
# Create the Boolean Series: sus
sus = (medals.Event_gender == 'W') & (medals.Gender == 'Men')
# Create a DataFrame with the suspicious row: suspect
suspect = medals.loc[sus, :]
# Print suspect
print(suspect)
# Group medals by 'NOC': country_grouped
country_grouped = medals.groupby("NOC")
# Compute the number of distinct sports in which each country won medals: Nsports
Nsports = country_grouped["Sport"].nunique()
# Sort the values of Nsports in descending order
Nsports = Nsports.sort_values(ascending=False)
# Print the top 15 rows of Nsports
print(Nsports.head(15))
# Extract all rows for which the 'Edition' is between 1952 & 1988: during_cold_war
during_cold_war = (medals["Edition"] >= 1952) & (medals["Edition"] <= 1988)
# Extract rows for which 'NOC' is either 'USA' or 'URS': is_usa_urs
is_usa_urs = medals.NOC.isin(["USA", "URS"])
# Use during_cold_war and is_usa_urs to create the DataFrame: cold_war_medals
cold_war_medals = medals.loc[during_cold_war & is_usa_urs]
# Group cold_war_medals by 'NOC'
country_grouped = cold_war_medals.groupby("NOC")
# Create Nsports
Nsports = country_grouped["Sport"].nunique().sort_values(ascending=False)
# Print Nsports
print(Nsports)
# Create the pivot table: medals_won_by_country
medals_won_by_country = medals.pivot_table(index="Edition", columns="NOC", values="Athlete", aggfunc="count")
# Slice medals_won_by_country: cold_war_usa_usr_medals
cold_war_usa_usr_medals = medals_won_by_country.loc[1952:1988, ["USA", "URS"]]
# Create most_medals
most_medals = cold_war_usa_usr_medals.idxmax(axis="columns")
# Print most_medals.value_counts()
print(most_medals.value_counts())
# Create the DataFrame: usa
usa = medals.loc[medals["NOC"] == "USA"]
# Group usa by ['Edition', 'Medal'] and aggregate over 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])["Athlete"].count()
# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level="Medal")
# Plot the DataFrame usa_medals_by_year
usa_medals_by_year.plot()
# plt.show()
plt.savefig("_dummyPy070.png", bbox_inches="tight")
plt.clf()
# Create the DataFrame: usa
usa = medals[medals.NOC == 'USA']
# Group usa by 'Edition', 'Medal', and 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])['Athlete'].count()
# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')
# Create an area plot of usa_medals_by_year
usa_medals_by_year.plot.area()
# plt.show()
plt.savefig("_dummyPy071.png", bbox_inches="tight")
plt.clf()
# Redefine 'Medal' as an ordered categorical
medals.Medal = pd.Categorical(values=medals.Medal, categories=['Bronze', 'Silver', 'Gold'], ordered=True)
# Create the DataFrame: usa
usa = medals[medals.NOC == 'USA']
# Group usa by 'Edition', 'Medal', and 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])['Athlete'].count()
# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')
# Create an area plot of usa_medals_by_year
usa_medals_by_year.plot.area()
# plt.show()
plt.savefig("_dummyPy072.png", bbox_inches="tight")
plt.clf()
## USA 4335
## URS 2049
## GBR 1594
## FRA 1314
## ITA 1228
## GER 1211
## AUS 1075
## HUN 1053
## SWE 1021
## GDR 825
## NED 782
## JPN 704
## CHN 679
## RUS 638
## ROU 624
## Name: NOC, dtype: int64
## Medal Bronze Gold Silver totals
## NOC
## USA 1052.0 2088.0 1195.0 4335.0
## URS 584.0 838.0 627.0 2049.0
## GBR 505.0 498.0 591.0 1594.0
## FRA 475.0 378.0 461.0 1314.0
## ITA 374.0 460.0 394.0 1228.0
## GER 454.0 407.0 350.0 1211.0
## AUS 413.0 293.0 369.0 1075.0
## HUN 345.0 400.0 308.0 1053.0
## SWE 325.0 347.0 349.0 1021.0
## GDR 225.0 329.0 271.0 825.0
## NED 320.0 212.0 250.0 782.0
## JPN 270.0 206.0 228.0 704.0
## CHN 193.0 234.0 252.0 679.0
## RUS 240.0 192.0 206.0 638.0
## ROU 282.0 155.0 187.0 624.0
## Event_gender Gender
## 0 M Men
## 348 X Men
## 416 W Women
## 639 X Women
## 23675 W Men
## City Edition Sport Discipline Athlete NOC Event \
## Event_gender Gender
## M Men 20067 20067 20067 20067 20067 20067 20067
## W Men 1 1 1 1 1 1 1
## Women 7277 7277 7277 7277 7277 7277 7277
## X Men 1653 1653 1653 1653 1653 1653 1653
## Women 218 218 218 218 218 218 218
##
## Medal
## Event_gender Gender
## M Men 20067
## W Men 1
## Women 7277
## X Men 1653
## Women 218
## City Edition Sport Discipline Athlete NOC Gender \
## 23675 Sydney 2000 Athletics Athletics CHEPCHUMBA, Joyce KEN Men
##
## Event Event_gender Medal
## 23675 marathon W Bronze
## NOC
## USA 34
## GBR 31
## FRA 28
## GER 26
## CHN 24
## AUS 22
## ESP 22
## CAN 22
## SWE 21
## URS 21
## ITA 21
## NED 20
## RUS 20
## JPN 20
## DEN 19
## Name: Sport, dtype: int64
## NOC
## URS 21
## USA 20
## Name: Sport, dtype: int64
## URS 8
## USA 2
## dtype: int64
Summer Olympics - USA Medals:
Summer Olympics - USA Medals:
Summer Olympics - USA Medals:
Chapter 1 - Preparing data
Reading multiple data files - many tools such as pd.read_csv(), pd.read_excel(), pd.read_html(), pd.read_json():
Reindexing DataFrames - essential for combining DataFrames, since indices are the means by which DataFrames are combined:
Arithmetic with Series and DataFrames - generally, scalar operations can be broadcast in Python:
Example code includes:
myPath = "./PythonInputFiles/"
# Import pandas
import pandas as pd
medals = pd.read_csv(myPath + "summerOlympics_Medalists_1896_2008.csv", header=4)
# Read 'Bronze.csv' into a DataFrame: bronze
# bronze = pd.read_csv("Bronze.csv")
bronze = medals.loc[medals["Medal"] == "Bronze"]
# Read 'Silver.csv' into a DataFrame: silver
# silver = pd.read_csv("Silver.csv")
silver = medals.loc[medals["Medal"] == "Silver"]
# Read 'Gold.csv' into a DataFrame: gold
# gold = pd.read_csv("Gold.csv")
gold = medals.loc[medals["Medal"] == "Gold"]
# Print the first five rows of gold
print(gold.head())
bronze.to_csv(myPath + "olymBronze.csv", index=False)
silver.to_csv(myPath + "olymSilver.csv", index=False)
gold.to_csv(myPath + "olymGold.csv", index=False)
# One time only - for use in next section
# bronze[["NOC", "Athlete"]].groupby("NOC").count().sort_values("Athlete", ascending=False).iloc[0:5, :].to_csv(myPath + "bronze_top5.csv")
# silver[["NOC", "Athlete"]].groupby("NOC").count().sort_values("Athlete", ascending=False).iloc[0:5, :].to_csv(myPath + "silver_top5.csv")
# gold[["NOC", "Athlete"]].groupby("NOC").count().sort_values("Athlete", ascending=False).iloc[0:5, :].to_csv(myPath + "gold_top5.csv")
# Create the list of file names: filenames
filenames = ['olymGold.csv', 'olymSilver.csv', 'olymBronze.csv']
# Create the list of three DataFrames: dataframes
dataframes = []
for filename in filenames:
dataframes.append(pd.read_csv(myPath + filename, encoding="latin-1"))
# Print top 5 rows of 1st DataFrame in dataframes
print(dataframes[0].head())
uqNOC = set(list(gold["NOC"].unique()) + list(silver["NOC"].unique()) + list(bronze["NOC"].unique()))
totGold = gold["NOC"].value_counts()
totSilver = silver["NOC"].value_counts()
totBronze = bronze["NOC"].value_counts()
totDF = pd.DataFrame( {"Gold":totGold, "Silver":totSilver, "Bronze":totBronze} ).fillna(0)
totDF["Total"] = totDF["Gold"] + totDF["Silver"] + totDF["Bronze"]
totDF = totDF[["Total", "Gold", "Silver", "Bronze"]]
totDF = totDF.sort_values("Total", ascending=False)
print(totDF.head(20))
# The sole variable is called "Max TemperatureF" with the index being called "Month"
maxTemps = [68, 60, 68, 84, 88, 89, 91, 86, 90, 84, 72, 68]
maxIndex = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Read 'monthly_max_temp.csv' into a DataFrame: weather1
# weather1 = pd.read_csv('monthly_max_temp.csv', index_col="Month")
weather1 = pd.DataFrame( {"Max TemperatureF":maxTemps}, index=maxIndex )
# Print the head of weather1
print(weather1.head())
# Sort the index of weather1 in alphabetical order: weather2
weather2 = weather1.sort_index()
# Print the head of weather2
print(weather2.head())
# Sort the index of weather1 in reverse alphabetical order: weather3
weather3 = weather1.sort_index(ascending=False)
# Print the head of weather3
print(weather3.head())
# Sort weather1 numerically using the values of 'Max TemperatureF': weather4
weather4 = weather1.sort_values("Max TemperatureF")
# Print the head of weather4
print(weather4.head())
# The variable is called "Mean TemperatureF" and the indexing is run by "Month"
# The dataset is then called weather1
meanTemps = [61.956043956043956, 32.133333333333333, 68.934782608695656, 43.434782608695649]
meanIndex = ["Apr", "Jan", "Jul", "Oct"]
year = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
weather1 = pd.DataFrame( {"Mean TemperatureF":meanTemps}, index=meanIndex )
print(weather1.head())
# Reindex weather1 using the list year: weather2
weather2 = weather1.reindex(year)
# Print weather2
print(weather2)
# Reindex weather1 using the list year with forward-fill: weather3
weather3 = weather1.reindex(year).ffill()
# Print weather3
print(weather3)
# Baby names data is from https://www.data.gov/developers/baby-names-dataset/
yob1881 = pd.read_csv(myPath + "yob1881.txt", header=None)
yob1981 = pd.read_csv(myPath + "yob1981.txt", header=None)
yob1881.columns = ["Name", "Gender", "Count"]
yob1981.columns = ["Name", "Gender", "Count"]
yob1881 = yob1881.set_index("Name").sort_values("Count", ascending=False)
yob1981 = yob1981.set_index("Name").sort_values("Count", ascending=False)
print(yob1881.shape)
print(yob1981.shape)
print(yob1881.head(12))
print(yob1981.head(12))
# Reindex names_1981 with index of names_1881: common_names
# Take only top-200 names by year
pop1881 = yob1881.iloc[0:200, :]
pop1981 = yob1981.iloc[0:200, :]
common_names = pop1981.reindex(pop1881.index)
# Print shape of common_names
print(common_names.shape)
print(common_names.head(12))
# Drop rows with null counts: common_names
common_names = common_names.dropna()
# Print shape of new common_names
print(common_names.shape)
print(common_names.head(12))
# weather is 365x22 representing 2013 Pittsburgh weather data from Weather Underground
# Used package "weatherData" to grab this from R
# KPIT2013 <- weatherData::getWeatherForDate("KPIT", "2013-01-01", "2013-12-31", opt_all_columns = TRUE)
# write.csv(KPIT2013, "./PythonInputFiles/KPIT2013.csv", row.names=FALSE)
weather = pd.read_csv(myPath + "KPIT2013.csv")
# Extract selected columns from weather as new DataFrame: temps_f
temps_f = weather[['Min_TemperatureF', 'Mean_TemperatureF', 'Max_TemperatureF']]
# Convert temps_f to celsius: temps_c
temps_c = (temps_f - 32) * (5/9)
# Rename 'F' in column names with 'C': temps_c.columns
temps_c.columns = temps_c.columns.str.replace("F", "C")
# Print first 5 rows of temps_c
print(temps_c.head())
# Quarterly US GDP data from 1947-01-01 to 2016-04-01
# Downloaded from https://fred.stlouisfed.org/series/GDP as myPath + "US_GDP_1947_2016_StLouisFRED.csv"
# Read 'GDP.csv' into a DataFrame: gdp
gdp = pd.read_csv(myPath + "US_GDP_1947_2016_StLouisFRED.csv", parse_dates=True, index_col="DATE")
# Slice all the gdp data from 2008 onward: post2008
post2008 = gdp.loc["2008-01-01":, :]
# Print the last 8 rows of post2008
print(post2008.tail(8))
# Resample post2008 by year, keeping last(): yearly
yearly = post2008.resample("A").last()
# Print yearly
print(yearly)
# Compute percentage growth of yearly: yearly['growth']
yearly['growth'] = yearly.pct_change()*100
# Print yearly again
print(yearly)
# Import pandas
# import pandas as pd
# Read 'sp500.csv' into a DataFrame: sp500
# sp500 = pd.read_csv("sp500.csv", parse_dates=True, index_col="Date")
# Read 'exchange.csv' into a DataFrame: exchange
# exchange = pd.read_csv("exchange.csv", parse_dates=True, index_col="Date")
# Subset 'Open' & 'Close' columns from sp500: dollars
# dollars = sp500.loc[:, ["Open", "Close"]]
# Print the head of dollars
# print(dollars.head())
# Convert dollars to pounds: pounds
# pounds = dollars.multiply(exchange["GBP/USD"], axis="rows")
# Print the head of pounds
# print(pounds.head())
## City Edition Sport Discipline Athlete NOC Gender \
## 0 Athens 1896 Aquatics Swimming HAJOS, Alfred HUN Men
## 3 Athens 1896 Aquatics Swimming MALOKINIS, Ioannis GRE Men
## 6 Athens 1896 Aquatics Swimming HAJOS, Alfred HUN Men
## 9 Athens 1896 Aquatics Swimming NEUMANN, Paul AUT Men
## 13 Athens 1896 Athletics Athletics BURKE, Thomas USA Men
##
## Event Event_gender Medal
## 0 100m freestyle M Gold
## 3 100m freestyle for sailors M Gold
## 6 1200m freestyle M Gold
## 9 400m freestyle M Gold
## 13 100m M Gold
## City Edition Sport Discipline Athlete NOC Gender \
## 0 Athens 1896 Aquatics Swimming HAJOS, Alfred HUN Men
## 1 Athens 1896 Aquatics Swimming MALOKINIS, Ioannis GRE Men
## 2 Athens 1896 Aquatics Swimming HAJOS, Alfred HUN Men
## 3 Athens 1896 Aquatics Swimming NEUMANN, Paul AUT Men
## 4 Athens 1896 Athletics Athletics BURKE, Thomas USA Men
##
## Event Event_gender Medal
## 0 100m freestyle M Gold
## 1 100m freestyle for sailors M Gold
## 2 1200m freestyle M Gold
## 3 400m freestyle M Gold
## 4 100m M Gold
## Total Gold Silver Bronze
## USA 4335.0 2088.0 1195.0 1052.0
## URS 2049.0 838.0 627.0 584.0
## GBR 1594.0 498.0 591.0 505.0
## FRA 1314.0 378.0 461.0 475.0
## ITA 1228.0 460.0 394.0 374.0
## GER 1211.0 407.0 350.0 454.0
## AUS 1075.0 293.0 369.0 413.0
## HUN 1053.0 400.0 308.0 345.0
## SWE 1021.0 347.0 349.0 325.0
## GDR 825.0 329.0 271.0 225.0
## NED 782.0 212.0 250.0 320.0
## JPN 704.0 206.0 228.0 270.0
## CHN 679.0 234.0 252.0 193.0
## RUS 638.0 192.0 206.0 240.0
## ROU 624.0 155.0 187.0 282.0
## CAN 592.0 154.0 211.0 227.0
## NOR 537.0 194.0 199.0 144.0
## POL 499.0 103.0 173.0 223.0
## DEN 491.0 147.0 192.0 152.0
## FRG 490.0 143.0 167.0 180.0
## Max TemperatureF
## Jan 68
## Feb 60
## Mar 68
## Apr 84
## May 88
## Max TemperatureF
## Apr 84
## Aug 86
## Dec 68
## Feb 60
## Jan 68
## Max TemperatureF
## Sep 90
## Oct 84
## Nov 72
## May 88
## Mar 68
## Max TemperatureF
## Feb 60
## Jan 68
## Mar 68
## Dec 68
## Nov 72
## Mean TemperatureF
## Apr 61.956044
## Jan 32.133333
## Jul 68.934783
## Oct 43.434783
## Mean TemperatureF
## Jan 32.133333
## Feb NaN
## Mar NaN
## Apr 61.956044
## May NaN
## Jun NaN
## Jul 68.934783
## Aug NaN
## Sep NaN
## Oct 43.434783
## Nov NaN
## Dec NaN
## Mean TemperatureF
## Jan 32.133333
## Feb 32.133333
## Mar 32.133333
## Apr 61.956044
## May 61.956044
## Jun 61.956044
## Jul 68.934783
## Aug 68.934783
## Sep 68.934783
## Oct 43.434783
## Nov 43.434783
## Dec 43.434783
## (1935, 2)
## (19471, 2)
## Gender Count
## Name
## John M 8769
## William M 8524
## Mary F 6919
## James M 5441
## George M 4664
## Charles M 4636
## Frank M 2834
## Anna F 2698
## Joseph M 2456
## Henry M 2339
## Thomas M 2282
## Edward M 2177
## Gender Count
## Name
## Michael M 68765
## Jennifer F 57046
## Christopher M 50228
## Matthew M 43324
## Jessica F 42530
## Jason M 41926
## David M 40647
## Joshua M 39054
## James M 38307
## John M 34881
## Robert M 34396
## Amanda F 34372
## (200, 2)
## Gender Count
## Name
## John M 34881.0
## William M 24803.0
## Mary F 11040.0
## James M 38307.0
## George M 5159.0
## Charles M 14428.0
## Frank M 3637.0
## Anna F 5189.0
## Joseph M 30771.0
## Henry NaN NaN
## Thomas M 17165.0
## Edward M 6657.0
## (42, 2)
## Gender Count
## Name
## John M 34881.0
## William M 24803.0
## Mary F 11040.0
## James M 38307.0
## George M 5159.0
## Charles M 14428.0
## Frank M 3637.0
## Anna F 5189.0
## Joseph M 30771.0
## Thomas M 17165.0
## Edward M 6657.0
## Robert M 34396.0
## Min_TemperatureC Mean_TemperatureC Max_TemperatureC
## 0 -6.111111 -2.777778 0.000000
## 1 -10.000000 -6.666667 -3.888889
## 2 -14.444444 -6.666667 0.555556
## 3 -3.333333 -1.666667 0.000000
## 4 -4.444444 -1.111111 1.666667
## GDP
## DATE
## 2015-04-01 17998.3
## 2015-07-01 18141.9
## 2015-10-01 18222.8
## 2016-01-01 18281.6
## 2016-04-01 18450.1
## 2016-07-01 18675.3
## 2016-10-01 18869.4
## 2017-01-01 19027.6
## GDP
## DATE
## 2008-12-31 14549.9
## 2009-12-31 14566.5
## 2010-12-31 15230.2
## 2011-12-31 15785.3
## 2012-12-31 16297.3
## 2013-12-31 16999.9
## 2014-12-31 17692.2
## 2015-12-31 18222.8
## 2016-12-31 18869.4
## 2017-12-31 19027.6
## GDP growth
## DATE
## 2008-12-31 14549.9 NaN
## 2009-12-31 14566.5 0.114090
## 2010-12-31 15230.2 4.556345
## 2011-12-31 15785.3 3.644732
## 2012-12-31 16297.3 3.243524
## 2013-12-31 16999.9 4.311144
## 2014-12-31 17692.2 4.072377
## 2015-12-31 18222.8 2.999062
## 2016-12-31 18869.4 3.548302
## 2017-12-31 19027.6 0.838394
Chapter 2 - Concatenating Data
Appending and concatenating Series - using .append() or pd.concat():
Appending and concatenating DataFrames:
Concatenation, keys, and MultiIndexes:
Outer and Inner Joins:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
import numpy as np
import random
# Do not have these .csv files
# Created dummy data and saved .csv to myPath
# keyDates = pd.date_range("2015-01-01", "2015-03-31")
# utHardware = [random.randint(2, 10) for p in range(len(keyDates))]
# utSoftware = [random.randint(1, 50) for p in range(len(keyDates))]
# utService = [random.randint(0, 200) for p in range(len(keyDates))]
# totSales = pd.DataFrame( {"Date":[str(x).split()[0] for x in keyDates], "Hardware":utHardware, "Software":utSoftware, "Service":utService } )
# totSales["Units"] = totSales["Hardware"] + totSales["Software"] + totSales["Service"]
# totSales["Company"] = ["A", "B", "C"] * 30
# totSales.iloc[:31, :].to_csv(myPath + "sales-jan-2015.csv", index=False)
# totSales.iloc[31:59, :].to_csv(myPath + "sales-feb-2015.csv", index=False)
# totSales.iloc[59:, :].to_csv(myPath + "sales-mar-2015.csv", index=False)
# Load 'sales-jan-2015.csv' into a DataFrame: jan
jan = pd.read_csv(myPath + "sales-jan-2015.csv", parse_dates=True, index_col="Date")
# Load 'sales-feb-2015.csv' into a DataFrame: feb
feb = pd.read_csv(myPath + "sales-feb-2015.csv", parse_dates=True, index_col="Date")
# Load 'sales-mar-2015.csv' into a DataFrame: mar
mar = pd.read_csv(myPath + "sales-mar-2015.csv", parse_dates=True, index_col="Date")
# Extract the 'Units' column from jan: jan_units
jan_units = jan['Units']
# Extract the 'Units' column from feb: feb_units
feb_units = feb['Units']
# Extract the 'Units' column from mar: mar_units
mar_units = mar['Units']
# Append feb_units and then mar_units to jan_units: quarter1
quarter1 = jan_units.append(feb_units).append(mar_units)
# Print the first slice from quarter1
print(quarter1.loc['jan 27, 2015':'feb 2, 2015'])
# Print the second slice from quarter1
print(quarter1.loc['feb 26, 2015':'mar 7, 2015'])
# Compute & print total sales in quarter1
print(quarter1.sum())
# Initialize empty list: units
units = []
# Build the list of Series
for month in [jan, feb, mar]:
units.append(month["Units"])
# Concatenate the list: quarter1
quarter1 = pd.concat(units, axis="rows")
# Print slices from quarter1
print(quarter1.loc['jan 27, 2015':'feb 2, 2015'])
print(quarter1.loc['feb 26, 2015':'mar 7, 2015'])
# Refers back to the names datasets from earlier in these chapters
yob1881 = pd.read_csv(myPath + "yob1881.txt", header=None)
yob1981 = pd.read_csv(myPath + "yob1981.txt", header=None)
yob1881.columns = ["Name", "Gender", "Count"]
yob1981.columns = ["Name", "Gender", "Count"]
names_1881 = yob1881.sort_values("Count", ascending=False)
names_1981 = yob1981.sort_values("Count", ascending=False)
# Add 'year' column to names_1881 and names_1981
names_1881['year'] = 1881
names_1981['year'] = 1981
# Append names_1981 after names_1881 with ignore_index=True: combined_names
combined_names = names_1881.append(names_1981, ignore_index=True)
# Print shapes of names_1981, names_1881, and combined_names
print(names_1981.shape)
print(names_1881.shape)
print(combined_names.shape)
# Print all rows that contain the name 'Morgan'
print(combined_names.loc[combined_names["Name"].str.contains("Morgan"), :])
# These data are the 4x1 of quarterly data from above in this workbook (Mean is actually the 12x1 with Max being the 4x1)
# The sole variable is called "Max TemperatureF" with the index being called "Month"
maxTemps = [68, 60, 68, 84, 88, 89, 91, 86, 90, 84, 72, 68]
maxIndex = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
meanTemps = [61.956043956043956, 32.133333333333333, 68.934782608695656, 43.434782608695649]
meanIndex = ["Apr", "Jan", "Jul", "Oct"]
weather_max = pd.DataFrame( {"Max TemperatureF":maxTemps}, index=maxIndex)
weather_mean = pd.DataFrame( {"Mean TemperatureF":meanTemps}, index=meanIndex)
# Concatenate weather_max and weather_mean horizontally: weather
weather = pd.concat([weather_max, weather_mean], axis=1).reindex(weather_max.index)
# Print weather
print(weather)
# This uses the Olympics medal datasets from previous
medal_types = ['bronze', 'silver', 'gold']
medals = []
for medal in medal_types:
# Create the file name: file_name
file_name = myPath + "%s_top5.csv" % medal # Note that the %s followed later by % medal means to replace the %s with the value of medal
# Create list of column names: columns
columns = ['Country', medal]
# Read file_name into a DataFrame: df
medal_df = pd.read_csv(file_name, header=0, index_col="Country", names=columns)
# Append medal_df to medals
medals.append(medal_df)
# Concatenate medals horizontally: medals
medals = pd.concat(medals, axis="columns")
# Print medals
print(medals)
medals = []
for medal in medal_types:
file_name = myPath + "%s_top5.csv" % medal
# Read file_name into a DataFrame: medal_df
medal_df = pd.read_csv(file_name, index_col="NOC")
# Append medal_df to medals
medals.append(medal_df)
# Concatenate medals: medals
medals = pd.concat(medals, keys=['bronze', 'silver', 'gold'])
# Print medals in entirety
print(medals)
# Sort the entries of medals: medals_sorted
medals_sorted = medals.sort_index(level=0)
# Print the number of Bronze medals won by Germany
print(medals_sorted.loc[('bronze','GER')])
# Print data about silver medals
print(medals_sorted.loc['silver'])
# Create alias for pd.IndexSlice: idx
idx = pd.IndexSlice
# Print all the data on medals won by the United Kingdom
print(medals_sorted.loc[idx[:,'GBR'], :])
# DO NOT HAVE THESE FILES - PROBABLY LINKED TO THE "sales" INPUTS FROM ABOVE
# Concatenate dataframes: february
# february = pd.concat(dataframes, axis=1, keys=['Hardware', 'Software', 'Service'])
# Print february.info()
# print(february.info())
# Assign pd.IndexSlice: idx
# idx = pd.IndexSlice
# Create the slice: slice_2_8
# slice_2_8 = february.loc['2015-02-02':'2015-02-08', idx[:, 'Company']]
# Print slice_2_8
# print(slice_2_8)
# CONTINUES TO BE jan/feb/mar FROM PREVIOUS "sales" INPUTS
# Make the list of tuples: month_list
month_list = [('january', jan), ('february', feb), ('march', mar)]
# Create an empty dictionary: month_dict
month_dict = {}
for month_name, month_data in month_list:
# Group month_data: month_dict[month_name]
month_dict[month_name] = month_data.groupby("Company").sum()
# Concatenate data in month_dict: sales
sales = pd.concat(month_dict)
# Print sales
print(sales)
# Print all sales by 'A'
idx = pd.IndexSlice
print(sales.loc[idx[:, 'A'], :])
# Again, the Olympics datasets (specifically, top-5 by medal type)
bronze_top5=pd.read_csv(myPath + "bronze_top5.csv", index_col="NOC")
silver_top5=pd.read_csv(myPath + "silver_top5.csv", index_col="NOC")
gold_top5=pd.read_csv(myPath + "gold_top5.csv", index_col="NOC")
# Create the list of DataFrames: medal_list
medal_list = [bronze_top5, silver_top5, gold_top5]
# Concatenate medal_list horizontally using an inner join: medals
medals = pd.concat(medal_list, axis=1, join="inner", keys=['bronze', 'silver', 'gold'])
medals.columns = ['bronze', 'silver', 'gold']
# Print medals
print(medals)
# US is quartely GDP starting 1947
# China is annual GDP starting 1966
# Resample and tidy china: china_annual
# china_annual = china.resample("A").pct_change(10).dropna()
# Resample and tidy us: us_annual
# us_annual = us.resample("A").pct_change(10).dropna()
# Concatenate china_annual and us_annual: gdp
# gdp = pd.concat([china_annual, us_annual], join="inner", axis=1)
# Resample gdp and print
# print(gdp.resample('10A').last())
## Date
## 2015-01-27 200
## 2015-01-28 223
## 2015-01-29 176
## 2015-01-30 124
## 2015-01-31 116
## 2015-02-01 116
## 2015-02-02 168
## Name: Units, dtype: int64
## Date
## 2015-02-26 234
## 2015-02-27 203
## 2015-02-28 118
## 2015-03-01 136
## 2015-03-02 31
## 2015-03-03 191
## 2015-03-04 80
## 2015-03-05 38
## 2015-03-06 111
## 2015-03-07 129
## Name: Units, dtype: int64
## 11979
## Date
## 2015-01-27 200
## 2015-01-28 223
## 2015-01-29 176
## 2015-01-30 124
## 2015-01-31 116
## 2015-02-01 116
## 2015-02-02 168
## Name: Units, dtype: int64
## Date
## 2015-02-26 234
## 2015-02-27 203
## 2015-02-28 118
## 2015-03-01 136
## 2015-03-02 31
## 2015-03-03 191
## 2015-03-04 80
## 2015-03-05 38
## 2015-03-06 111
## 2015-03-07 129
## Name: Units, dtype: int64
## (19471, 4)
## (1935, 4)
## (21406, 4)
## Name Gender Count year
## 680 Morgan M 23 1881
## 2249 Morgan F 1769 1981
## 2521 Morgan M 766 1981
## 10117 Morgana F 14 1981
## 13078 Morgann F 9 1981
## 19844 Morganne F 5 1981
## Max TemperatureF Mean TemperatureF
## Jan 68 32.133333
## Feb 60 NaN
## Mar 68 NaN
## Apr 84 61.956044
## May 88 NaN
## Jun 89 NaN
## Jul 91 68.934783
## Aug 86 NaN
## Sep 90 NaN
## Oct 84 43.434783
## Nov 72 NaN
## Dec 68 NaN
## bronze silver gold
## FRA 475.0 461.0 NaN
## GBR 505.0 591.0 498.0
## GER 454.0 NaN 407.0
## ITA NaN 394.0 460.0
## URS 584.0 627.0 838.0
## USA 1052.0 1195.0 2088.0
## Athlete
## NOC
## bronze USA 1052
## URS 584
## GBR 505
## FRA 475
## GER 454
## silver USA 1195
## URS 627
## GBR 591
## FRA 461
## ITA 394
## gold USA 2088
## URS 838
## GBR 498
## ITA 460
## GER 407
## Athlete 454
## Name: (bronze, GER), dtype: int64
## Athlete
## NOC
## FRA 461
## GBR 591
## ITA 394
## URS 627
## USA 1195
## Athlete
## NOC
## bronze GBR 505
## gold GBR 498
## silver GBR 591
## Hardware Service Software Units
## Company
## february A 47 986 210 1243
## B 70 1092 242 1404
## C 41 966 189 1196
## january A 72 1133 252 1457
## B 68 1117 188 1373
## C 50 1037 277 1364
## march A 66 667 247 980
## B 56 1137 303 1496
## C 65 1139 262 1466
## Hardware Service Software Units
## Company
## february A 47 986 210 1243
## january A 72 1133 252 1457
## march A 66 667 247 980
## bronze silver gold
## NOC
## USA 1052 1195 2088
## URS 584 627 838
## GBR 505 591 498
Chapter 3 - Merging Data
Merging DataFrames - an extension of concatenation that allows for merging on things other than the index:
Joining DataFrames - various types of joins, and implications on processing efficency:
Ordered merges - DataFrames where the underlying data has a natural order (such as time series data):
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
revenue = pd.DataFrame({"branch_id" : [10, 20, 30, 47] , "city" : ["Austin", "Denver", "Springfield", "Mendocino"] , "revenue" : [100, 83, 4, 200] } )
managers = pd.DataFrame({"branch_id" : [10, 20, 47, 31] , "city" : ["Austin", "Denver", "Mendocino", "Springfield"] , "manager" : ["Charles", "Joel", "Brett", "Sally"] } )
# Merge revenue with managers on 'city': merge_by_city
merge_by_city = pd.merge(revenue, managers, on="city")
# Print merge_by_city
print(merge_by_city)
# Merge revenue with managers on 'branch_id': merge_by_id
merge_by_id = pd.merge(revenue, managers, on="branch_id")
# Print merge_by_id
print(merge_by_id)
revenue["state"] = ["TX", "CO", "IL", "CA"]
managers["state"] = ["TX", "CO", "CA", "MO"]
managers=managers.iloc[:, [1, 0, 2, 3]]
managers.columns = ["branch", "branch_id", "manager", "state"]
# Merge revenue & managers on 'city' & 'branch': combined
combined = pd.merge(revenue, managers, left_on="city", right_on="branch")
# Print combined
print(combined)
# Add 'state' column to revenue: revenue['state']
# revenue['state'] = ['TX','CO','IL','CA'] # already handled above
# Add 'state' column to managers: managers['state']
# managers['state'] = ['TX','CO','CA','MO'] # already handled above
managers = managers.iloc[:, [1, 0, 2, 3]] # get back to how it was
managers.columns = ["branch_id", "city", "manager", "state"]
# Merge revenue & managers on 'branch_id', 'city', & 'state': combined
combined = pd.merge(revenue, managers, on=["branch_id", "city", "state"])
# Print combined
print(combined)
sales = pd.DataFrame( { "city" : ["Mendocino", "Denver", "Austin", "Springield", "Springfield"] , "state" : ["CA", "CO", "TX", "MO", "IL"] , "units" : [1, 4, 2, 5, 1] } )
managers=managers.iloc[:, [1, 0, 2, 3]]
managers.columns = ["branch", "branch_id", "manager", "state"]
# Merge revenue and sales: revenue_and_sales
revenue_and_sales = pd.merge(revenue, sales, how="right", on=['city', 'state'])
# Print revenue_and_sales
print(revenue_and_sales)
# Merge sales and managers: sales_and_managers
sales_and_managers = pd.merge(sales, managers, how="left", left_on=['city', 'state'], right_on=['branch', 'state'])
# Print sales_and_managers
print(sales_and_managers)
# Perform the first merge: merge_default
merge_default = pd.merge(sales_and_managers, revenue_and_sales)
# Print merge_default
print(merge_default)
# Perform the second merge: merge_outer
merge_outer = pd.merge(sales_and_managers, revenue_and_sales, how="outer")
# Print merge_outer
print(merge_outer)
# Perform the third merge: merge_outer_on
merge_outer_on = pd.merge(sales_and_managers, revenue_and_sales, on=['city','state'], how="outer")
# Print merge_outer_on
print(merge_outer_on)
austin = pd.DataFrame( { "date":pd.to_datetime(["2016-01-01", "2016-02-08", "2016-01-17"]), "ratings" : ["Cloudy", "Cloudy", "Sunny"] } )
houston = pd.DataFrame( { "date":pd.to_datetime(["2016-01-04", "2016-01-01", "2016-03-01"]), "ratings" : ["Rainy", "Cloudy", "Sunny"] } )
# Perform the first ordered merge: tx_weather
tx_weather = pd.merge_ordered(austin, houston)
# Print tx_weather
print(tx_weather)
# Perform the second ordered merge: tx_weather_suff
tx_weather_suff = pd.merge_ordered(austin, houston, on="date", suffixes=['_aus','_hus'])
# Print tx_weather_suff
print(tx_weather_suff)
# Perform the third ordered merge: tx_weather_ffill
tx_weather_ffill = pd.merge_ordered(austin, houston, on="date", suffixes=['_aus','_hus'], fill_method="ffill")
# Print tx_weather_ffill
print(tx_weather_ffill)
# Similar to pd.merge_ordered(), the pd.merge_asof() function will also merge values in order using the on column, but for each row in the left DataFrame, only rows from the right DataFrame whose 'on' column values are less than the left value will be kept.
# DO NOT HAVE THESE DATASETS
# Merge auto and oil: merged
# merged = pd.merge_asof(auto, oil, left_on="yr", right_on="Date")
# Print the tail of merged
# print(merged.tail())
# Resample merged: yearly
# yearly = merged.resample("A", on="Date")[['mpg','Price']].mean()
# Print yearly
# print(yearly)
# print yearly.corr()
# print(yearly.corr())
## branch_id_x city revenue branch_id_y manager
## 0 10 Austin 100 10 Charles
## 1 20 Denver 83 20 Joel
## 2 30 Springfield 4 31 Sally
## 3 47 Mendocino 200 47 Brett
## branch_id city_x revenue city_y manager
## 0 10 Austin 100 Austin Charles
## 1 20 Denver 83 Denver Joel
## 2 47 Mendocino 200 Mendocino Brett
## branch_id_x city revenue state_x branch branch_id_y \
## 0 10 Austin 100 TX Austin 10
## 1 20 Denver 83 CO Denver 20
## 2 30 Springfield 4 IL Springfield 31
## 3 47 Mendocino 200 CA Mendocino 47
##
## manager state_y
## 0 Charles TX
## 1 Joel CO
## 2 Sally MO
## 3 Brett CA
## branch_id city revenue state manager
## 0 10 Austin 100 TX Charles
## 1 20 Denver 83 CO Joel
## 2 47 Mendocino 200 CA Brett
## branch_id city revenue state units
## 0 10.0 Austin 100.0 TX 2
## 1 20.0 Denver 83.0 CO 4
## 2 30.0 Springfield 4.0 IL 1
## 3 47.0 Mendocino 200.0 CA 1
## 4 NaN Springield NaN MO 5
## city state units branch branch_id manager
## 0 Mendocino CA 1 Mendocino 47.0 Brett
## 1 Denver CO 4 Denver 20.0 Joel
## 2 Austin TX 2 Austin 10.0 Charles
## 3 Springield MO 5 NaN NaN NaN
## 4 Springfield IL 1 NaN NaN NaN
## city state units branch branch_id manager revenue
## 0 Mendocino CA 1 Mendocino 47.0 Brett 200.0
## 1 Denver CO 4 Denver 20.0 Joel 83.0
## 2 Austin TX 2 Austin 10.0 Charles 100.0
## 3 Springield MO 5 NaN NaN NaN NaN
## city state units branch branch_id manager revenue
## 0 Mendocino CA 1 Mendocino 47.0 Brett 200.0
## 1 Denver CO 4 Denver 20.0 Joel 83.0
## 2 Austin TX 2 Austin 10.0 Charles 100.0
## 3 Springield MO 5 NaN NaN NaN NaN
## 4 Springfield IL 1 NaN NaN NaN NaN
## 5 Springfield IL 1 NaN 30.0 NaN 4.0
## city state units_x branch branch_id_x manager branch_id_y \
## 0 Mendocino CA 1 Mendocino 47.0 Brett 47.0
## 1 Denver CO 4 Denver 20.0 Joel 20.0
## 2 Austin TX 2 Austin 10.0 Charles 10.0
## 3 Springield MO 5 NaN NaN NaN NaN
## 4 Springfield IL 1 NaN NaN NaN 30.0
##
## revenue units_y
## 0 200.0 1
## 1 83.0 4
## 2 100.0 2
## 3 NaN 5
## 4 4.0 1
## date ratings
## 0 2016-01-01 Cloudy
## 1 2016-01-04 Rainy
## 2 2016-01-17 Sunny
## 3 2016-02-08 Cloudy
## 4 2016-03-01 Sunny
## date ratings_aus ratings_hus
## 0 2016-01-01 Cloudy Cloudy
## 1 2016-01-04 NaN Rainy
## 2 2016-01-17 Sunny NaN
## 3 2016-02-08 Cloudy NaN
## 4 2016-03-01 NaN Sunny
## date ratings_aus ratings_hus
## 0 2016-01-01 Cloudy Cloudy
## 1 2016-01-04 Cloudy Rainy
## 2 2016-01-17 Sunny Rainy
## 3 2016-02-08 Cloudy Rainy
## 4 2016-03-01 Cloudy Sunny
Chapter 4 - Case Study (Summer Olympics)
Medals in the Summer Olympics - does a country win more medals when it is the host?:
Quantifying Performance:
Reshaping and plotting:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
import matplotlib.pyplot as plt
# Create files needed for reading in later
# medals = pd.read_csv(myPath + "summerOlympics_Medalists_1896_2008.csv", header=4)
# uqYears = medals["Edition"].value_counts().sort_index().index
# for x in uqYears:
# outFile = myPath + '_notuse_summer_{:d}.csv'.format(x)
# outData = medals.loc[medals["Edition"] == x]
# outData.to_csv(outFile, index=False)
#
# Create file path: file_path
file_path = myPath + "summerOlympics_Hosts_1896_2008.txt"
# Load DataFrame from file_path: editions
editions = pd.read_csv(file_path, sep="\t")
# Extract the relevant columns: editions
editions = editions[['Edition', 'Grand Total', 'City', 'Country']]
# Print editions DataFrame
print(editions)
# Create the file path: file_path
file_path = myPath + 'olympicsCountryCodes.csv'
# Load DataFrame from file_path: ioc_codes
ioc_codes = pd.read_csv(file_path)
ioc_codes.columns = ["Country", "NOC", "ISO", "Country_1"]
# Extract the relevant columns: ioc_codes
ioc_codes = ioc_codes[["Country", "NOC"]]
# Print first and last 5 rows of ioc_codes
print(ioc_codes.head())
print(ioc_codes.tail())
# Create empty dictionary: medals_dict
medals_dict = {}
for year in editions['Edition']:
# Create the file path: file_path
file_path = myPath + '_notuse_summer_{:d}.csv'.format(year)
# Load file_path into a DataFrame: medals_dict[year]
medals_dict[year] = pd.read_csv(file_path, encoding="latin-1")
# Extract relevant columns: medals_dict[year]
medals_dict[year] = medals_dict[year][['Athlete', 'NOC', 'Medal']]
# Assign year to column 'Edition' of medals_dict
medals_dict[year]['Edition'] = year
# Concatenate medals_dict: medals
medals = pd.concat(medals_dict, ignore_index=True)
# Print first and last 5 rows of medals
print(medals.head())
print(medals.tail())
# Construct the pivot_table: medal_counts
medal_counts = medals.pivot_table(index="Edition", columns="NOC", values="Athlete", aggfunc="count")
# Print the first & last 5 rows of medal_counts
print(medal_counts.head())
print(medal_counts.tail())
# Set Index of editions: totals
totals = editions.set_index("Edition")
# Reassign totals['Grand Total']: totals
totals = totals["Grand Total"]
# Divide medal_counts by totals: fractions
fractions = medal_counts.divide(totals, axis="rows")
# Print first & last 5 rows of fractions
print(fractions.head())
print(fractions.tail())
# CHECK IN TO WHAT THE .expanding() does here . . .
# Apply the expanding mean: mean_fractions
mean_fractions = fractions.expanding().mean()
# Compute the percentage change: fractions_change
fractions_change = mean_fractions.pct_change() * 100
# Reset the index of fractions_change: fractions_change
fractions_change = fractions_change.reset_index()
# Print first & last 5 rows of fractions_change
print(fractions_change.head())
print(fractions_change.tail())
# Left join editions and ioc_codes: hosts
hosts = pd.merge(editions, ioc_codes, how="left")
# Extract relevant columns and set index: hosts
hosts = hosts[["Edition", "NOC"]].set_index("Edition")
# Fix missing 'NOC' values of hosts
print(hosts.loc[hosts.NOC.isnull()])
hosts.loc[1972, 'NOC'] = 'FRG'
hosts.loc[1980, 'NOC'] = 'URS'
hosts.loc[1988, 'NOC'] = 'KOR'
# Reset Index of hosts: hosts
hosts = hosts.reset_index()
# Print hosts
print(hosts)
# Reshape fractions_change: reshaped
reshaped = pd.melt(fractions_change, id_vars="Edition", value_name="Change")
# Print reshaped.shape and fractions_change.shape
print(reshaped.shape, fractions_change.shape)
# Extract rows from reshaped where 'NOC' == 'CHN': chn
chn = reshaped[reshaped["NOC"] == "CHN"]
# Print last 5 rows of chn with .tail()
print(chn.tail())
# Merge reshaped and hosts: merged
merged = pd.merge(reshaped, hosts, how="inner")
# Print first 5 rows of merged
print(merged.head())
# Set Index of merged and sort it: influence
influence = merged.set_index("Edition").sort_index()
# Print first 5 rows of influence
print(influence.head())
# Import pyplot
import matplotlib.pyplot as plt
# Extract influence['Change']: change
change = influence["Change"]
# Make bar plot of change: ax
ax = change.plot(kind="bar")
# Customize the plot to improve readability
ax.set_ylabel("% Change of Host Country Medal Count")
ax.set_title("Is there a Host Country Advantage?")
ax.set_xticklabels(editions['City'])
# Display the plot
# plt.show()
plt.savefig("_dummyPy073.png", bbox_inches="tight")
plt.clf()
## Edition Grand Total City Country
## 0 1896 151 Athens Greece
## 1 1900 512 Paris France
## 2 1904 470 St. Louis United States
## 3 1908 804 London United Kingdom
## 4 1912 885 Stockholm Sweden
## 5 1920 1298 Antwerp Belgium
## 6 1924 884 Paris France
## 7 1928 710 Amsterdam Netherlands
## 8 1932 615 Los Angeles United States
## 9 1936 875 Berlin Germany
## 10 1948 814 London United Kingdom
## 11 1952 889 Helsinki Finland
## 12 1956 885 Melbourne Australia
## 13 1960 882 Rome Italy
## 14 1964 1010 Tokyo Japan
## 15 1968 1031 Mexico City Mexico
## 16 1972 1185 Munich West Germany (now Germany)
## 17 1976 1305 Montreal Canada
## 18 1980 1387 Moscow U.S.S.R. (now Russia)
## 19 1984 1459 Los Angeles United States
## 20 1988 1546 Seoul South Korea
## 21 1992 1705 Barcelona Spain
## 22 1996 1859 Atlanta United States
## 23 2000 2015 Sydney Australia
## 24 2004 1998 Athens Greece
## 25 2008 2042 Beijing China
## Country NOC
## 0 Afghanistan AFG
## 1 Albania ALB
## 2 Algeria ALG
## 3 American Samoa* ASA
## 4 Andorra AND
## Country NOC
## 196 Vietnam VIE
## 197 Virgin Islands* ISV
## 198 Yemen YEM
## 199 Zambia ZAM
## 200 Zimbabwe ZIM
## Athlete NOC Medal Edition
## 0 HAJOS, Alfred HUN Gold 1896
## 1 HERSCHMANN, Otto AUT Silver 1896
## 2 DRIVAS, Dimitrios GRE Bronze 1896
## 3 MALOKINIS, Ioannis GRE Gold 1896
## 4 CHASAPIS, Spiridon GRE Silver 1896
## Athlete NOC Medal Edition
## 29211 ENGLICH, Mirko GER Silver 2008
## 29212 MIZGAITIS, Mindaugas LTU Bronze 2008
## 29213 PATRIKEEV, Yuri ARM Bronze 2008
## 29214 LOPEZ, Mijain CUB Gold 2008
## 29215 BAROEV, Khasan RUS Silver 2008
## NOC AFG AHO ALG ANZ ARG ARM AUS AUT AZE BAH ... URS URU \
## Edition ...
## 1896 NaN NaN NaN NaN NaN NaN 2.0 5.0 NaN NaN ... NaN NaN
## 1900 NaN NaN NaN NaN NaN NaN 5.0 6.0 NaN NaN ... NaN NaN
## 1904 NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN ... NaN NaN
## 1908 NaN NaN NaN 19.0 NaN NaN NaN 1.0 NaN NaN ... NaN NaN
## 1912 NaN NaN NaN 10.0 NaN NaN NaN 14.0 NaN NaN ... NaN NaN
##
## NOC USA UZB VEN VIE YUG ZAM ZIM ZZX
## Edition
## 1896 20.0 NaN NaN NaN NaN NaN NaN 6.0
## 1900 55.0 NaN NaN NaN NaN NaN NaN 34.0
## 1904 394.0 NaN NaN NaN NaN NaN NaN 8.0
## 1908 63.0 NaN NaN NaN NaN NaN NaN NaN
## 1912 101.0 NaN NaN NaN NaN NaN NaN NaN
##
## [5 rows x 138 columns]
## NOC AFG AHO ALG ANZ ARG ARM AUS AUT AZE BAH ... URS URU \
## Edition ...
## 1992 NaN NaN 2.0 NaN 2.0 NaN 57.0 6.0 NaN 1.0 ... NaN NaN
## 1996 NaN NaN 3.0 NaN 20.0 2.0 132.0 3.0 1.0 5.0 ... NaN NaN
## 2000 NaN NaN 5.0 NaN 20.0 1.0 183.0 4.0 3.0 6.0 ... NaN 1.0
## 2004 NaN NaN NaN NaN 47.0 NaN 157.0 8.0 5.0 2.0 ... NaN NaN
## 2008 1.0 NaN 2.0 NaN 51.0 6.0 149.0 3.0 7.0 5.0 ... NaN NaN
##
## NOC USA UZB VEN VIE YUG ZAM ZIM ZZX
## Edition
## 1992 224.0 NaN NaN NaN NaN NaN NaN NaN
## 1996 260.0 2.0 NaN NaN 26.0 1.0 NaN NaN
## 2000 248.0 4.0 NaN 1.0 26.0 NaN NaN NaN
## 2004 264.0 5.0 2.0 NaN NaN NaN 3.0 NaN
## 2008 315.0 6.0 1.0 1.0 NaN NaN 4.0 NaN
##
## [5 rows x 138 columns]
## NOC AFG AHO ALG ANZ ARG ARM AUS AUT AZE BAH \
## Edition
## 1896 NaN NaN NaN NaN NaN NaN 0.013245 0.033113 NaN NaN
## 1900 NaN NaN NaN NaN NaN NaN 0.009766 0.011719 NaN NaN
## 1904 NaN NaN NaN NaN NaN NaN NaN 0.002128 NaN NaN
## 1908 NaN NaN NaN 0.023632 NaN NaN NaN 0.001244 NaN NaN
## 1912 NaN NaN NaN 0.011299 NaN NaN NaN 0.015819 NaN NaN
##
## NOC ... URS URU USA UZB VEN VIE YUG ZAM ZIM ZZX
## Edition ...
## 1896 ... NaN NaN 0.132450 NaN NaN NaN NaN NaN NaN 0.039735
## 1900 ... NaN NaN 0.107422 NaN NaN NaN NaN NaN NaN 0.066406
## 1904 ... NaN NaN 0.838298 NaN NaN NaN NaN NaN NaN 0.017021
## 1908 ... NaN NaN 0.078358 NaN NaN NaN NaN NaN NaN NaN
## 1912 ... NaN NaN 0.114124 NaN NaN NaN NaN NaN NaN NaN
##
## [5 rows x 138 columns]
## NOC AFG AHO ALG ANZ ARG ARM AUS AUT \
## Edition
## 1992 NaN NaN 0.001173 NaN 0.001173 NaN 0.033431 0.003519
## 1996 NaN NaN 0.001614 NaN 0.010758 0.001076 0.071006 0.001614
## 2000 NaN NaN 0.002481 NaN 0.009926 0.000496 0.090819 0.001985
## 2004 NaN NaN NaN NaN 0.023524 NaN 0.078579 0.004004
## 2008 0.00049 NaN 0.000979 NaN 0.024976 0.002938 0.072968 0.001469
##
## NOC AZE BAH ... URS URU USA UZB VEN \
## Edition ...
## 1992 NaN 0.000587 ... NaN NaN 0.131378 NaN NaN
## 1996 0.000538 0.002690 ... NaN NaN 0.139860 0.001076 NaN
## 2000 0.001489 0.002978 ... NaN 0.000496 0.123077 0.001985 NaN
## 2004 0.002503 0.001001 ... NaN NaN 0.132132 0.002503 0.001001
## 2008 0.003428 0.002449 ... NaN NaN 0.154261 0.002938 0.000490
##
## NOC VIE YUG ZAM ZIM ZZX
## Edition
## 1992 NaN NaN NaN NaN NaN
## 1996 NaN 0.013986 0.000538 NaN NaN
## 2000 0.000496 0.012903 NaN NaN NaN
## 2004 NaN NaN NaN 0.001502 NaN
## 2008 0.000490 NaN NaN 0.001959 NaN
##
## [5 rows x 138 columns]
## NOC Edition AFG AHO ALG ANZ ARG ARM AUS AUT AZE \
## 0 1896 NaN NaN NaN NaN NaN NaN NaN NaN NaN
## 1 1900 NaN NaN NaN NaN NaN NaN -13.134766 -32.304688 NaN
## 2 1904 NaN NaN NaN NaN NaN NaN 0.000000 -30.169386 NaN
## 3 1908 NaN NaN NaN NaN NaN NaN 0.000000 -23.013510 NaN
## 4 1912 NaN NaN NaN -26.092774 NaN NaN 0.000000 6.254438 NaN
##
## NOC ... URS URU USA UZB VEN VIE YUG ZAM ZIM ZZX
## 0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## 1 ... NaN NaN -9.448242 NaN NaN NaN NaN NaN NaN 33.561198
## 2 ... NaN NaN 199.651245 NaN NaN NaN NaN NaN NaN -22.642384
## 3 ... NaN NaN -19.549222 NaN NaN NaN NaN NaN NaN 0.000000
## 4 ... NaN NaN -12.105733 NaN NaN NaN NaN NaN NaN 0.000000
##
## [5 rows x 139 columns]
## NOC Edition AFG AHO ALG ANZ ARG ARM AUS \
## 21 1992 NaN 0.0 -7.214076 0.0 -6.767308 NaN 2.754114
## 22 1996 NaN 0.0 8.959211 0.0 1.306696 NaN 10.743275
## 23 2000 NaN 0.0 19.762488 0.0 0.515190 -26.935484 12.554986
## 24 2004 NaN 0.0 0.000000 0.0 9.625365 0.000000 8.161162
## 25 2008 NaN 0.0 -8.197807 0.0 8.588555 91.266408 6.086870
##
## NOC AUT AZE ... URS URU USA UZB VEN \
## 21 -3.034840 NaN ... 0.0 0.000000 -1.329330 NaN 0.000000
## 22 -3.876773 NaN ... 0.0 0.000000 -1.010378 NaN 0.000000
## 23 -3.464221 88.387097 ... 0.0 -12.025323 -1.341842 42.258065 0.000000
## 24 -2.186922 48.982144 ... 0.0 0.000000 -1.031922 21.170339 -1.615969
## 25 -3.389836 31.764436 ... 0.0 0.000000 -0.450031 14.610625 -6.987342
##
## NOC VIE YUG ZAM ZIM ZZX
## 21 NaN 0.000000 0.000000 0.000000 0.0
## 22 NaN -2.667732 -10.758472 0.000000 0.0
## 23 NaN -2.696445 0.000000 0.000000 0.0
## 24 0.000000 0.000000 0.000000 -43.491929 0.0
## 25 -0.661117 0.000000 0.000000 -23.316533 0.0
##
## [5 rows x 139 columns]
## NOC
## Edition
## 1972 NaN
## 1980 NaN
## 1988 NaN
## Edition NOC
## 0 1896 GRE
## 1 1900 FRA
## 2 1904 USA
## 3 1908 GBR
## 4 1912 SWE
## 5 1920 BEL
## 6 1924 FRA
## 7 1928 NED
## 8 1932 USA
## 9 1936 GER
## 10 1948 GBR
## 11 1952 FIN
## 12 1956 AUS
## 13 1960 ITA
## 14 1964 JPN
## 15 1968 MEX
## 16 1972 FRG
## 17 1976 CAN
## 18 1980 URS
## 19 1984 USA
## 20 1988 KOR
## 21 1992 ESP
## 22 1996 USA
## 23 2000 AUS
## 24 2004 GRE
## 25 2008 CHN
## (3588, 3) (26, 139)
## Edition NOC Change
## 567 1992 CHN 4.240630
## 568 1996 CHN 7.860247
## 569 2000 CHN -3.851278
## 570 2004 CHN 0.128863
## 571 2008 CHN 13.251332
## Edition NOC Change
## 0 1956 AUS 54.615063
## 1 2000 AUS 12.554986
## 2 1920 BEL 54.757887
## 3 1976 CAN -2.143977
## 4 2008 CHN 13.251332
## NOC Change
## Edition
## 1896 GRE NaN
## 1900 FRA 198.002486
## 1904 USA 199.651245
## 1908 GBR 134.489218
## 1912 SWE 71.896226
Summer Olympics - % Change in Medals (Host Country):
Chapter 1 - Basics of Relational Databases
Introduction to Databases - relational tables that store data (course features US Census data):
Connecting to Your Database - tools in SQLAlchemy, which allows for writing SQL code using Python:
Introduction to SQL - basic commands:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
# Appears that the SQL file has two tables, "census" and "state_fact"
# Downloaded a different version of the file from:
# https://www.gfairchild.com/2011/12/13/2010-census-sqlite-database/
# This data contains ['counties', 'states', 'states_zctas', 'zctas']
# Import create_engine
from sqlalchemy import create_engine
# Create an engine that connects to the census.sqlite file: engine
engine = create_engine("sqlite:///" + myPath + "2010CensusPopulation.db")
# Print table names
print(engine.table_names())
from sqlalchemy import MetaData
metadata = MetaData() # I think, it has already been loaded/created in the exercises . . .
# Import Table
from sqlalchemy import Table
# Reflect census table from the engine: census (uses states instead . . . )
# census = Table("census", metadata, autoload=True, autoload_with=engine)
census = Table("states", metadata, autoload=True, autoload_with=engine)
# Print census table metadata
print(repr(census))
# Output in DataCamp example is: Table('census', MetaData(bind=None), Column('state', VARCHAR(length=30), table=<census>), Column('sex', VARCHAR(length=1), table=<census>), Column('age', INTEGER(), table=<census>), Column('pop2000', INTEGER(), table=<census>), Column('pop2008', INTEGER(), table=<census>), schema=None)
# MANY more columns using the data I have
# Reflect the census table from the engine: census (per previous, using 'states' instead)
census = Table("states", metadata, autoload=True, autoload_with=engine)
# Print the column names
print(census.columns.keys())
# Print full table metadata (per previous, using 'states' instead)
print(repr(metadata.tables["states"]))
# Build select statement for census table: stmt
# stmt = "SELECT * FROM census"
stmt = "SELECT * FROM states"
# Execute the statement and fetch the results: results
connection = engine.connect() # Create connection to the engine defined above (not sure . . . )
results = connection.execute(stmt).fetchall()
# Print Results (too long to print the entire thing)
# print(results)
print(type(results))
print(len(results))
print(results[0])
# Import select
from sqlalchemy import select
# Reflect census table via engine: census (per previous, use states instead)
# census = Table('census', metadata, autoload=True, autoload_with=engine)
census = Table('states', metadata, autoload=True, autoload_with=engine)
# Build select statement for census table: stmt
stmt = select([census])
# Print the emitted statement to see the SQL emitted
print(stmt)
# Execute the statement and print the results (WAY TOO LONG!)
# print(connection.execute(stmt).fetchall())
# Get the first row of the results by using an index: first_row
first_row = results[0]
# Print the first row of the results
print(first_row)
# Print the first column of the first row by using an index
print(first_row[0])
# Print the 'state' column of the first row by using its name
print(first_row["state"])
# Make it a sensible DataFrame
myDF = pd.DataFrame(results)
myDF.columns = census.columns.keys()
print(myDF.shape)
# Melt the data down so that gender and age are the columns
# Key by id-state
# Ax total population and gender subtotals and centroids
colNamesNo = ["centroid_longitude", "centroid_latitude", "population_total", "population_male_total", "population_female_total"]
colNumsNo = [list(myDF.columns).index(x) for x in colNamesNo]
myBasic = myDF.iloc[:, [0, 1] + colNumsNo] # [0, 1] are id-state
myPreMelt = myDF.iloc[:, [a not in colNumsNo for a in range(len(myDF.columns))]]
myMelt = myPreMelt.melt(id_vars=["id", "state"], var_name="gender_age", value_name="pop2010")
myMelt["gender"] = [x.split("_")[1] for x in myMelt["gender_age"]]
myMelt["age"] = [x.split("_")[2] for x in myMelt["gender_age"]]
print(myMelt.shape)
print(myMelt.head(10))
print(myMelt.tail(10))
print(myMelt["gender"].value_counts())
print(myMelt["age"].value_counts())
print(myMelt.info())
## ['counties', 'states', 'states_zctas', 'zctas']
## Table('states', MetaData(bind=None), Column('id', INTEGER(), table=<states>, primary_key=True, nullable=False), Column('state', TEXT(), table=<states>, nullable=False), Column('centroid_longitude', REAL(), table=<states>, nullable=False), Column('centroid_latitude', REAL(), table=<states>, nullable=False), Column('population_total', INTEGER(), table=<states>, nullable=False), Column('population_male_total', INTEGER(), table=<states>, nullable=False), Column('population_male_lt5', INTEGER(), table=<states>, nullable=False), Column('population_male_5to9', INTEGER(), table=<states>, nullable=False), Column('population_male_10to14', INTEGER(), table=<states>, nullable=False), Column('population_male_15to17', INTEGER(), table=<states>, nullable=False), Column('population_male_18to19', INTEGER(), table=<states>, nullable=False), Column('population_male_20', INTEGER(), table=<states>, nullable=False), Column('population_male_21', INTEGER(), table=<states>, nullable=False), Column('population_male_22to24', INTEGER(), table=<states>, nullable=False), Column('population_male_25to29', INTEGER(), table=<states>, nullable=False), Column('population_male_30to34', INTEGER(), table=<states>, nullable=False), Column('population_male_35to39', INTEGER(), table=<states>, nullable=False), Column('population_male_40to44', INTEGER(), table=<states>, nullable=False), Column('population_male_45to49', INTEGER(), table=<states>, nullable=False), Column('population_male_50to54', INTEGER(), table=<states>, nullable=False), Column('population_male_55to59', INTEGER(), table=<states>, nullable=False), Column('population_male_60to61', INTEGER(), table=<states>, nullable=False), Column('population_male_62to64', INTEGER(), table=<states>, nullable=False), Column('population_male_65to66', INTEGER(), table=<states>, nullable=False), Column('population_male_67to69', INTEGER(), table=<states>, nullable=False), Column('population_male_70to74', INTEGER(), table=<states>, nullable=False), Column('population_male_75to79', INTEGER(), table=<states>, nullable=False), Column('population_male_80to84', INTEGER(), table=<states>, nullable=False), Column('population_male_ge85', INTEGER(), table=<states>, nullable=False), Column('population_female_total', INTEGER(), table=<states>, nullable=False), Column('population_female_lt5', INTEGER(), table=<states>, nullable=False), Column('population_female_5to9', INTEGER(), table=<states>, nullable=False), Column('population_female_10to14', INTEGER(), table=<states>, nullable=False), Column('population_female_15to17', INTEGER(), table=<states>, nullable=False), Column('population_female_18to19', INTEGER(), table=<states>, nullable=False), Column('population_female_20', INTEGER(), table=<states>, nullable=False), Column('population_female_21', INTEGER(), table=<states>, nullable=False), Column('population_female_22to24', INTEGER(), table=<states>, nullable=False), Column('population_female_25to29', INTEGER(), table=<states>, nullable=False), Column('population_female_30to34', INTEGER(), table=<states>, nullable=False), Column('population_female_35to39', INTEGER(), table=<states>, nullable=False), Column('population_female_40to44', INTEGER(), table=<states>, nullable=False), Column('population_female_45to49', INTEGER(), table=<states>, nullable=False), Column('population_female_50to54', INTEGER(), table=<states>, nullable=False), Column('population_female_55to59', INTEGER(), table=<states>, nullable=False), Column('population_female_60to61', INTEGER(), table=<states>, nullable=False), Column('population_female_62to64', INTEGER(), table=<states>, nullable=False), Column('population_female_65to66', INTEGER(), table=<states>, nullable=False), Column('population_female_67to69', INTEGER(), table=<states>, nullable=False), Column('population_female_70to74', INTEGER(), table=<states>, nullable=False), Column('population_female_75to79', INTEGER(), table=<states>, nullable=False), Column('population_female_80to84', INTEGER(), table=<states>, nullable=False), Column('population_female_ge85', INTEGER(), table=<states>, nullable=False), schema=None)
## ['id', 'state', 'centroid_longitude', 'centroid_latitude', 'population_total', 'population_male_total', 'population_male_lt5', 'population_male_5to9', 'population_male_10to14', 'population_male_15to17', 'population_male_18to19', 'population_male_20', 'population_male_21', 'population_male_22to24', 'population_male_25to29', 'population_male_30to34', 'population_male_35to39', 'population_male_40to44', 'population_male_45to49', 'population_male_50to54', 'population_male_55to59', 'population_male_60to61', 'population_male_62to64', 'population_male_65to66', 'population_male_67to69', 'population_male_70to74', 'population_male_75to79', 'population_male_80to84', 'population_male_ge85', 'population_female_total', 'population_female_lt5', 'population_female_5to9', 'population_female_10to14', 'population_female_15to17', 'population_female_18to19', 'population_female_20', 'population_female_21', 'population_female_22to24', 'population_female_25to29', 'population_female_30to34', 'population_female_35to39', 'population_female_40to44', 'population_female_45to49', 'population_female_50to54', 'population_female_55to59', 'population_female_60to61', 'population_female_62to64', 'population_female_65to66', 'population_female_67to69', 'population_female_70to74', 'population_female_75to79', 'population_female_80to84', 'population_female_ge85']
## Table('states', MetaData(bind=None), Column('id', INTEGER(), table=<states>, primary_key=True, nullable=False), Column('state', TEXT(), table=<states>, nullable=False), Column('centroid_longitude', REAL(), table=<states>, nullable=False), Column('centroid_latitude', REAL(), table=<states>, nullable=False), Column('population_total', INTEGER(), table=<states>, nullable=False), Column('population_male_total', INTEGER(), table=<states>, nullable=False), Column('population_male_lt5', INTEGER(), table=<states>, nullable=False), Column('population_male_5to9', INTEGER(), table=<states>, nullable=False), Column('population_male_10to14', INTEGER(), table=<states>, nullable=False), Column('population_male_15to17', INTEGER(), table=<states>, nullable=False), Column('population_male_18to19', INTEGER(), table=<states>, nullable=False), Column('population_male_20', INTEGER(), table=<states>, nullable=False), Column('population_male_21', INTEGER(), table=<states>, nullable=False), Column('population_male_22to24', INTEGER(), table=<states>, nullable=False), Column('population_male_25to29', INTEGER(), table=<states>, nullable=False), Column('population_male_30to34', INTEGER(), table=<states>, nullable=False), Column('population_male_35to39', INTEGER(), table=<states>, nullable=False), Column('population_male_40to44', INTEGER(), table=<states>, nullable=False), Column('population_male_45to49', INTEGER(), table=<states>, nullable=False), Column('population_male_50to54', INTEGER(), table=<states>, nullable=False), Column('population_male_55to59', INTEGER(), table=<states>, nullable=False), Column('population_male_60to61', INTEGER(), table=<states>, nullable=False), Column('population_male_62to64', INTEGER(), table=<states>, nullable=False), Column('population_male_65to66', INTEGER(), table=<states>, nullable=False), Column('population_male_67to69', INTEGER(), table=<states>, nullable=False), Column('population_male_70to74', INTEGER(), table=<states>, nullable=False), Column('population_male_75to79', INTEGER(), table=<states>, nullable=False), Column('population_male_80to84', INTEGER(), table=<states>, nullable=False), Column('population_male_ge85', INTEGER(), table=<states>, nullable=False), Column('population_female_total', INTEGER(), table=<states>, nullable=False), Column('population_female_lt5', INTEGER(), table=<states>, nullable=False), Column('population_female_5to9', INTEGER(), table=<states>, nullable=False), Column('population_female_10to14', INTEGER(), table=<states>, nullable=False), Column('population_female_15to17', INTEGER(), table=<states>, nullable=False), Column('population_female_18to19', INTEGER(), table=<states>, nullable=False), Column('population_female_20', INTEGER(), table=<states>, nullable=False), Column('population_female_21', INTEGER(), table=<states>, nullable=False), Column('population_female_22to24', INTEGER(), table=<states>, nullable=False), Column('population_female_25to29', INTEGER(), table=<states>, nullable=False), Column('population_female_30to34', INTEGER(), table=<states>, nullable=False), Column('population_female_35to39', INTEGER(), table=<states>, nullable=False), Column('population_female_40to44', INTEGER(), table=<states>, nullable=False), Column('population_female_45to49', INTEGER(), table=<states>, nullable=False), Column('population_female_50to54', INTEGER(), table=<states>, nullable=False), Column('population_female_55to59', INTEGER(), table=<states>, nullable=False), Column('population_female_60to61', INTEGER(), table=<states>, nullable=False), Column('population_female_62to64', INTEGER(), table=<states>, nullable=False), Column('population_female_65to66', INTEGER(), table=<states>, nullable=False), Column('population_female_67to69', INTEGER(), table=<states>, nullable=False), Column('population_female_70to74', INTEGER(), table=<states>, nullable=False), Column('population_female_75to79', INTEGER(), table=<states>, nullable=False), Column('population_female_80to84', INTEGER(), table=<states>, nullable=False), Column('population_female_ge85', INTEGER(), table=<states>, nullable=False), schema=None)
## <class 'list'>
## 52
## (1, 'Wyoming', -107.5419255, 42.9918024, 563626, 287437, 20596, 19203, 18592, 11385, 8241, 4406, 4211, 12698, 21752, 18919, 17702, 17149, 19713, 22450, 20928, 7338, 9540, 5058, 6497, 8126, 5704, 4176, 3053, 276189, 19607, 18010, 17363, 10646, 7870, 3971, 3763, 11269, 19524, 17454, 16159, 15956, 19759, 21655, 20018, 6785, 8904, 4976, 6443, 8468, 6788, 5252, 5549)
## SELECT states.id, states.state, states.centroid_longitude, states.centroid_latitude, states.population_total, states.population_male_total, states.population_male_lt5, states.population_male_5to9, states.population_male_10to14, states.population_male_15to17, states.population_male_18to19, states.population_male_20, states.population_male_21, states.population_male_22to24, states.population_male_25to29, states.population_male_30to34, states.population_male_35to39, states.population_male_40to44, states.population_male_45to49, states.population_male_50to54, states.population_male_55to59, states.population_male_60to61, states.population_male_62to64, states.population_male_65to66, states.population_male_67to69, states.population_male_70to74, states.population_male_75to79, states.population_male_80to84, states.population_male_ge85, states.population_female_total, states.population_female_lt5, states.population_female_5to9, states.population_female_10to14, states.population_female_15to17, states.population_female_18to19, states.population_female_20, states.population_female_21, states.population_female_22to24, states.population_female_25to29, states.population_female_30to34, states.population_female_35to39, states.population_female_40to44, states.population_female_45to49, states.population_female_50to54, states.population_female_55to59, states.population_female_60to61, states.population_female_62to64, states.population_female_65to66, states.population_female_67to69, states.population_female_70to74, states.population_female_75to79, states.population_female_80to84, states.population_female_ge85
## FROM states
## (1, 'Wyoming', -107.5419255, 42.9918024, 563626, 287437, 20596, 19203, 18592, 11385, 8241, 4406, 4211, 12698, 21752, 18919, 17702, 17149, 19713, 22450, 20928, 7338, 9540, 5058, 6497, 8126, 5704, 4176, 3053, 276189, 19607, 18010, 17363, 10646, 7870, 3971, 3763, 11269, 19524, 17454, 16159, 15956, 19759, 21655, 20018, 6785, 8904, 4976, 6443, 8468, 6788, 5252, 5549)
## 1
## Wyoming
## (52, 53)
## (2392, 6)
## id state gender_age pop2010 gender age
## 0 1 Wyoming population_male_lt5 20596 male lt5
## 1 2 Pennsylvania population_male_lt5 373216 male lt5
## 2 3 Ohio population_male_lt5 367479 male lt5
## 3 4 New Mexico population_male_lt5 74078 male lt5
## 4 5 Maryland population_male_lt5 185916 male lt5
## 5 6 Rhode Island population_male_lt5 29396 male lt5
## 6 7 Oregon population_male_lt5 121828 male lt5
## 7 8 Puerto Rico population_male_lt5 115173 male lt5
## 8 9 Wisconsin population_male_lt5 183391 male lt5
## 9 10 North Dakota population_male_lt5 22821 male lt5
## id state gender_age pop2010 gender age
## 2382 43 Iowa population_female_ge85 51307 female ge85
## 2383 44 Arizona population_female_ge85 65662 female ge85
## 2384 45 Minnesota population_female_ge85 72357 female ge85
## 2385 46 Louisiana population_female_ge85 44789 female ge85
## 2386 47 District of Columbia population_female_ge85 7198 female ge85
## 2387 48 Virginia population_female_ge85 83957 female ge85
## 2388 49 Texas population_female_ge85 204208 female ge85
## 2389 50 Vermont population_female_ge85 8694 female ge85
## 2390 51 Maine population_female_ge85 19797 female ge85
## 2391 52 North Carolina population_female_ge85 103205 female ge85
## male 1196
## female 1196
## Name: gender, dtype: int64
## 10to14 104
## 15to17 104
## 20 104
## 5to9 104
## 45to49 104
## 67to69 104
## 21 104
## 70to74 104
## 55to59 104
## 50to54 104
## 60to61 104
## lt5 104
## ge85 104
## 40to44 104
## 30to34 104
## 62to64 104
## 25to29 104
## 75to79 104
## 65to66 104
## 80to84 104
## 22to24 104
## 35to39 104
## 18to19 104
## Name: age, dtype: int64
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 2392 entries, 0 to 2391
## Data columns (total 6 columns):
## id 2392 non-null int64
## state 2392 non-null object
## gender_age 2392 non-null object
## pop2010 2392 non-null int64
## gender 2392 non-null object
## age 2392 non-null object
## dtypes: int64(2), object(4)
## memory usage: 74.8+ KB
## None
Chapter 2 - Applying Filtering, Ordering, etc.
Filtering and Targeting Data - select subsets of records based on specified criteria:
Overview of Ordering - equivalent of the ORDER BY method of SQL:
Counting, Summing, and Grouping Data - much more efficient to run these using SQL rather than to grab all the data and run these in Python:
Visualize Data using pandas and matplotlib:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
# Import create_engine function
from sqlalchemy import create_engine, MetaData, Table, select
# Create an engine to the census database
# engine = create_engine('postgresql+psycopg2://' + 'student:datacamp' + '@postgresql.csrrinzqubik.us-east-1.rds.amazonaws.com' + ':5432/census')
# Created dummy data with real state-gender-age-pop2010 and totally fake pop2000 = (0.90, 1.05) * pop2010
engine = create_engine("sqlite:///" + myPath + "PartialFakeCensusExample.db")
# Use the .table_names() method on the engine to print the table names
print(engine.table_names())
# Create a select query: stmt
metadata = MetaData()
census = Table("census", metadata, autoload=True, autoload_with=engine) # make sure this is set up
stmt = select([census])
# Add a where clause to filter the results to only those for New York
stmt = stmt.where(census.columns["state"] == "New York")
# Execute the query to retrieve all the data returned: results
# Execute the statement and fetch the results: results
connection = engine.connect() # Create connection to the engine defined above (not sure . . . )
results = connection.execute(stmt).fetchall()
# Loop over the results and print the age, sex (gender), and pop2008 (pop2010)
for result in results:
print(result.age, result.gender, result.pop2010)
states = ['New York', 'California', 'Texas']
# Create a query for the census table: stmt
stmt = select([census])
# Append a where clause to match all the states in_ the list states
stmt = stmt.where(census.columns.state.in_(states))
# Loop over the ResultProxy and print the state and its population in 2000
for x in connection.execute(stmt):
print(x.state, x.pop2000)
# Import and_
from sqlalchemy import and_
# Build a query for the census table: stmt
stmt = select([census])
# Append a where clause to select only non-male records from California using and_
stmt = stmt.where(
# The state of California with a non-male sex
and_(census.columns.state == "California",
census.columns.gender != "male"
)
)
# Loop over the ResultProxy printing the age and sex
for result in connection.execute(stmt):
print(result.age, result.gender)
# Build a query to select the state column: stmt
stmt = select([census.columns.state])
# Order stmt by the state column
stmt = stmt.order_by(census.columns.state)
# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()
# Print the first 10 results
print(results[:10])
# Import desc
from sqlalchemy import desc
# Build a query to select the state column: stmt
stmt = select([census.columns.state])
# Order stmt by state in descending order: rev_stmt
rev_stmt = stmt.order_by(desc(census.columns.state))
# Execute the query and store the results: rev_results
rev_results = connection.execute(rev_stmt).fetchall()
# Print the first 10 rev_results
print(rev_results[:10])
# Build a query to select state and age: stmt
stmt = select([census.columns.state, census.columns.age])
# Append order by to ascend by state and descend by age
stmt = stmt.order_by(census.columns.state, desc(census.columns.age))
# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()
# Print the first 20 results
print(results[:20])
from sqlalchemy import func
# Build a query to count the distinct states values: stmt
stmt = select([func.count(census.columns.state.distinct())])
# Execute the query and store the scalar result: distinct_state_count
distinct_state_count = connection.execute(stmt).scalar()
# Print the distinct_state_count
print(distinct_state_count)
# Import func
from sqlalchemy import func
# Build a query to select the state and count of ages by state: stmt
stmt = select([census.columns.state, func.count(census.columns.age)])
# Group stmt by state
stmt = stmt.group_by(census.columns.state)
# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()
# Print results
print(results)
# Print the keys/column names of the results returned
print(results[0].keys())
# Import func
from sqlalchemy import func
# Build an expression to calculate the sum of pop2008 labeled as population
pop2010_sum = func.sum(census.columns.pop2010).label("population")
# Build a query to select the state and sum of pop2008: stmt
stmt = select([census.columns.state, pop2010_sum])
# Group stmt by state
stmt = stmt.group_by(census.columns.state)
# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()
# Print results
print(results)
# Print the keys/column names of the results returned
print(results[0].keys())
# import pandas
import pandas as pd
# Create a DataFrame from the results: df
df = pd.DataFrame(results)
# Set column names
df.columns = results[0].keys()
# Print the Dataframe
print(df)
# Import Pyplot as plt from matplotlib
import matplotlib.pyplot as plt
# Plot the DataFrame
df.sort_values("population", ascending=False).set_index("state").plot.bar()
# plt.show()
plt.savefig("_dummyPy074.png", bbox_inches="tight")
plt.clf()
## ['census', 'state_fact']
## lt5 male 590879
## 5to9 male 594362
## 10to14 male 619243
## 15to17 male 406797
## 18to19 male 292751
## 20 male 149840
## 21 male 143298
## 22to24 male 418864
## 25to29 male 680203
## 30to34 male 629759
## 35to39 male 613775
## 40to44 male 663333
## 45to49 male 709523
## 50to54 male 687779
## 55to59 male 591847
## 60to61 male 214047
## 62to64 male 286312
## 65to66 male 151551
## 67to69 male 200704
## 70to74 male 258616
## 75to79 male 200049
## 80to84 male 150993
## ge85 male 122622
## lt5 female 564943
## 5to9 female 569593
## 10to14 female 592213
## 15to17 female 386899
## 18to19 female 279831
## 20 female 143243
## 21 female 138298
## 22to24 female 417392
## 25to29 female 699974
## 30to34 female 649401
## 35to39 female 640349
## 40to44 female 692560
## 45to49 female 749240
## 50to54 female 732149
## 55to59 female 645561
## 60to61 female 239946
## 62to64 female 325955
## 65to66 female 178609
## 67to69 female 242347
## 70to74 female 328775
## 75to79 female 274758
## 80to84 female 240667
## ge85 female 268252
## New York 567834
## California 1261704
## Texas 969386
## New York 569398
## California 1287240
## Texas 950364
## New York 601284
## California 1332544
## Texas 955163
## New York 424289
## California 880198
## Texas 602017
## New York 297435
## California 542407
## Texas 351896
## New York 145494
## California 282228
## Texas 199048
## New York 145590
## California 282458
## Texas 190191
## New York 427241
## California 815489
## Texas 504550
## New York 688365
## California 1411515
## Texas 861970
## New York 625980
## California 1246955
## Texas 912905
## New York 642622
## California 1203556
## Texas 889281
## New York 648739
## California 1234523
## Texas 803674
## New York 744289
## California 1366139
## Texas 814497
## New York 655453
## California 1305805
## Texas 767493
## New York 543907
## California 1114914
## Texas 630442
## New York 208909
## California 345910
## Texas 242777
## New York 297764
## California 474567
## Texas 298426
## New York 141397
## California 281418
## Texas 168120
## New York 209334
## California 355970
## Texas 229053
## New York 251116
## California 423892
## Texas 257465
## New York 200849
## California 337174
## Texas 212283
## New York 142990
## California 230677
## Texas 141253
## New York 121273
## California 220723
## Texas 103495
## New York 526526
## California 1143243
## Texas 960377
## New York 528012
## California 1210334
## Texas 957641
## New York 567340
## California 1228329
## Texas 908907
## New York 403535
## California 845514
## Texas 523162
## New York 254926
## California 533823
## Texas 333994
## New York 132643
## California 268662
## Texas 174983
## New York 125297
## California 258666
## Texas 166167
## New York 376070
## California 754520
## Texas 486828
## New York 655175
## California 1378453
## Texas 906760
## New York 636412
## California 1305925
## Texas 860873
## New York 668524
## California 1205276
## Texas 827988
## New York 632999
## California 1343752
## Texas 867432
## New York 696043
## California 1401748
## Texas 836895
## New York 705059
## California 1169000
## Texas 838466
## New York 583587
## California 1175155
## Texas 678970
## New York 235147
## California 424561
## Texas 236343
## New York 339971
## California 509927
## Texas 317358
## New York 172357
## California 284615
## Texas 195515
## New York 240165
## California 386112
## Texas 265604
## New York 317596
## California 488747
## Texas 339314
## New York 281901
## California 444839
## Texas 268446
## New York 245721
## California 333861
## Texas 216712
## New York 280055
## California 395224
## Texas 189300
## lt5 female
## 5to9 female
## 10to14 female
## 15to17 female
## 18to19 female
## 20 female
## 21 female
## 22to24 female
## 25to29 female
## 30to34 female
## 35to39 female
## 40to44 female
## 45to49 female
## 50to54 female
## 55to59 female
## 60to61 female
## 62to64 female
## 65to66 female
## 67to69 female
## 70to74 female
## 75to79 female
## 80to84 female
## ge85 female
## [('Alabama',), ('Alabama',), ('Alabama',), ('Alabama',), ('Alabama',), ('Alabama',), ('Alabama',), ('Alabama',), ('Alabama',), ('Alabama',)]
## [('Wyoming',), ('Wyoming',), ('Wyoming',), ('Wyoming',), ('Wyoming',), ('Wyoming',), ('Wyoming',), ('Wyoming',), ('Wyoming',), ('Wyoming',)]
## [('Alabama', 'lt5'), ('Alabama', 'lt5'), ('Alabama', 'ge85'), ('Alabama', 'ge85'), ('Alabama', '80to84'), ('Alabama', '80to84'), ('Alabama', '75to79'), ('Alabama', '75to79'), ('Alabama', '70to74'), ('Alabama', '70to74'), ('Alabama', '67to69'), ('Alabama', '67to69'), ('Alabama', '65to66'), ('Alabama', '65to66'), ('Alabama', '62to64'), ('Alabama', '62to64'), ('Alabama', '60to61'), ('Alabama', '60to61'), ('Alabama', '5to9'), ('Alabama', '5to9')]
## 52
## [('Alabama', 46), ('Alaska', 46), ('Arizona', 46), ('Arkansas', 46), ('California', 46), ('Colorado', 46), ('Connecticut', 46), ('Delaware', 46), ('District of Columbia', 46), ('Florida', 46), ('Georgia', 46), ('Hawaii', 46), ('Idaho', 46), ('Illinois', 46), ('Indiana', 46), ('Iowa', 46), ('Kansas', 46), ('Kentucky', 46), ('Louisiana', 46), ('Maine', 46), ('Maryland', 46), ('Massachusetts', 46), ('Michigan', 46), ('Minnesota', 46), ('Mississippi', 46), ('Missouri', 46), ('Montana', 46), ('Nebraska', 46), ('Nevada', 46), ('New Hampshire', 46), ('New Jersey', 46), ('New Mexico', 46), ('New York', 46), ('North Carolina', 46), ('North Dakota', 46), ('Ohio', 46), ('Oklahoma', 46), ('Oregon', 46), ('Pennsylvania', 46), ('Puerto Rico', 46), ('Rhode Island', 46), ('South Carolina', 46), ('South Dakota', 46), ('Tennessee', 46), ('Texas', 46), ('Utah', 46), ('Vermont', 46), ('Virginia', 46), ('Washington', 46), ('West Virginia', 46), ('Wisconsin', 46), ('Wyoming', 46)]
## ['state', 'count_1']
## [('Alabama', 4779736), ('Alaska', 710231), ('Arizona', 6392017), ('Arkansas', 2915918), ('California', 37253956), ('Colorado', 5029196), ('Connecticut', 3574097), ('Delaware', 897934), ('District of Columbia', 601723), ('Florida', 18801310), ('Georgia', 9687653), ('Hawaii', 1360301), ('Idaho', 1567582), ('Illinois', 12830632), ('Indiana', 6483802), ('Iowa', 3046355), ('Kansas', 2853118), ('Kentucky', 4339367), ('Louisiana', 4533372), ('Maine', 1328361), ('Maryland', 5773552), ('Massachusetts', 6547629), ('Michigan', 9883640), ('Minnesota', 5303925), ('Mississippi', 2967297), ('Missouri', 5988927), ('Montana', 989415), ('Nebraska', 1826341), ('Nevada', 2700551), ('New Hampshire', 1316470), ('New Jersey', 8791894), ('New Mexico', 2059179), ('New York', 19378102), ('North Carolina', 9535483), ('North Dakota', 672591), ('Ohio', 11536504), ('Oklahoma', 3751351), ('Oregon', 3831074), ('Pennsylvania', 12702379), ('Puerto Rico', 3725789), ('Rhode Island', 1052567), ('South Carolina', 4625364), ('South Dakota', 814180), ('Tennessee', 6346105), ('Texas', 25145561), ('Utah', 2763885), ('Vermont', 625741), ('Virginia', 8001024), ('Washington', 6724540), ('West Virginia', 1852994), ('Wisconsin', 5686986), ('Wyoming', 563626)]
## ['state', 'population']
## state population
## 0 Alabama 4779736
## 1 Alaska 710231
## 2 Arizona 6392017
## 3 Arkansas 2915918
## 4 California 37253956
## 5 Colorado 5029196
## 6 Connecticut 3574097
## 7 Delaware 897934
## 8 District of Columbia 601723
## 9 Florida 18801310
## 10 Georgia 9687653
## 11 Hawaii 1360301
## 12 Idaho 1567582
## 13 Illinois 12830632
## 14 Indiana 6483802
## 15 Iowa 3046355
## 16 Kansas 2853118
## 17 Kentucky 4339367
## 18 Louisiana 4533372
## 19 Maine 1328361
## 20 Maryland 5773552
## 21 Massachusetts 6547629
## 22 Michigan 9883640
## 23 Minnesota 5303925
## 24 Mississippi 2967297
## 25 Missouri 5988927
## 26 Montana 989415
## 27 Nebraska 1826341
## 28 Nevada 2700551
## 29 New Hampshire 1316470
## 30 New Jersey 8791894
## 31 New Mexico 2059179
## 32 New York 19378102
## 33 North Carolina 9535483
## 34 North Dakota 672591
## 35 Ohio 11536504
## 36 Oklahoma 3751351
## 37 Oregon 3831074
## 38 Pennsylvania 12702379
## 39 Puerto Rico 3725789
## 40 Rhode Island 1052567
## 41 South Carolina 4625364
## 42 South Dakota 814180
## 43 Tennessee 6346105
## 44 Texas 25145561
## 45 Utah 2763885
## 46 Vermont 625741
## 47 Virginia 8001024
## 48 Washington 6724540
## 49 West Virginia 1852994
## 50 Wisconsin 5686986
## 51 Wyoming 563626
Population (2010) by State:
Chapter 3 - Advanced SQL Alchemy Queries
Calculating Values in a Query - addition, subtraction, multiplication, and the like:
SQL Relationships - bridging data that appears in multiple SQL tables:
Working with Hierarchical Tables (self-referential tables) - tables that refer to themselves:
Dealing with large ResultSets - running out of memory or disk space or the like:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
# Import sqlalchemy functions
from sqlalchemy import create_engine, MetaData, Table, select, func, desc
# Create an engine to the census database
# engine = create_engine('mysql+pymysql://' + 'student:datacamp' + '@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:3306/' + 'census')
# Created dummy data with real state-gender-age-pop2010 and totally fake pop2000 = (0.90, 1.05) * pop2010
engine = create_engine("sqlite:///" + myPath + "PartialFakeCensusExample.db")
# Print the table names
print(engine.table_names())
# General pre-amble to be able to access "census"
metadata = MetaData()
census = Table("census", metadata, autoload=True, autoload_with=engine) # make sure this is set up
state_fact = Table("state_fact", metadata, autoload=True, autoload_with=engine) # make sure this is set up
# Build query to return state names by population difference from 2008 (make 2010) to 2000: stmt
stmt = select([census.columns.state, (census.columns.pop2010 - census.columns.pop2000).label("pop_change")])
# Append group by for the state: stmt
stmt = stmt.group_by(census.columns.state)
# Append order by for pop_change descendingly: stmt
stmt = stmt.order_by(desc("pop_change"))
# Return only 5 results: stmt
stmt = stmt.limit(5)
# Use connection to execute the statement and fetch all results
connection = engine.connect() # Create connection to the engine defined above (not sure . . . )
results = connection.execute(stmt).fetchall()
# Print the state and population change for each record
for result in results:
print('{}:{}'.format(result.state, result.pop_change))
# import case, cast and Float from sqlalchemy
from sqlalchemy import case, cast, Float
# Build an expression to calculate female population in 2000
female_pop2000 = func.sum(
case([
(census.columns.gender == "female", census.columns.pop2000)
], else_=0))
# Cast an expression to calculate total population in 2000 to Float
total_pop2000 = cast(func.sum(census.columns.pop2000), Float)
# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([female_pop2000 / total_pop2000* 100])
# Execute the query and store the scalar result: percent_female
percent_female = connection.execute(stmt).scalar()
# Print the percentage
print(percent_female)
# Build a statement to join census and state_fact tables: stmt
stmt = select([census.columns.pop2000, state_fact.columns.abbreviation])
# Execute the statement and get the first result: result
result = connection.execute(stmt).first()
# Loop over the keys in the result object and print the key and value
for key in result.keys():
print(key, getattr(result, key))
# Build a statement to select the census and state_fact tables: stmt
stmt = select([census, state_fact])
# Add a select_from clause that wraps a join for the census and state_fact
# tables where the census state column and state_fact name column match
stmt = stmt.select_from(
(census.join(state_fact, census.columns.state == state_fact.columns.name)))
# Execute the statement and get the first result: result
result = connection.execute(stmt).first()
# Loop over the keys in the result object and print the key and value
for key in result.keys():
print(key, getattr(result, key))
# Build a statement to select the state, sum of 2008 (using 2010 instead) population and census
# division name: stmt
stmt = select([
census.columns.state,
func.sum(census.columns.pop2010),
state_fact.columns.census_division_name
])
# Append select_from to join the census and state_fact tables by the census state and state_fact name columns
stmt = stmt.select_from(
census.join(state_fact, census.columns.state == state_fact.columns.name)
)
# Append a group by for the state_fact name column
stmt = stmt.group_by(state_fact.columns.name)
# Execute the statement and get the results: results
results = connection.execute(stmt).fetchall()
# Loop over the the results object and print each record.
for record in results:
print(record)
# Make an alias of the employees table: managers
# managers = employees.alias()
# Build a query to select manager's and their employees names: stmt
# stmt = select(
# [managers.columns.name.label('manager'),
# employees.columns.name.label("employee")]
# )
# Match managers id with employees mgr: stmt
# stmt = stmt.where(managers.columns.id == employees.columns.mgr)
# Order the statement by the managers name: stmt
# stmt = stmt.order_by(managers.columns.name)
# Execute statement: results
# results = connection.execute(stmt).fetchall()
# Print records
# for record in results:
# print(record)
# Make an alias of the employees table: managers
# managers = employees.alias()
# Build a query to select managers and counts of their employees: stmt
# stmt = select([managers.columns.name, func.count(employees.columns.id)])
# Append a where clause that ensures the manager id and employee mgr are equal
# stmt = stmt.where(managers.columns.id == employees.columns.mgr)
# Group by Managers Name
# stmt = stmt.group_by(managers.columns.name)
# Execute statement: results
# results = connection.execute(stmt).fetchall()
# print manager
# for record in results:
# print(record)
# Start a while loop checking for more results
# while more_results:
# Fetch the first 50 results from the ResultProxy: partial_results
# partial_results = results_proxy.fetchmany(50)
# if empty list, set more_results to False
# if partial_results == []:
# more_results = False
# Loop over the fetched records and increment the count for the state
# for row in partial_results:
# if row.state in state_count:
# state_count[row.state] += 1
# else:
# state_count[row.state] = 1
# Close the ResultProxy, and thus the connection
# results_proxy.close()
# Print the count by state
# print(state_count)
## ['census', 'state_fact']
## Florida:22065
## Illinois:15716
## Texas:14908
## Indiana:6848
## Massachusetts:6111
## 50.85769837165718
## pop2000 21543
## abbreviation AL
## id 1
## state Wyoming
## gender male
## age lt5
## pop2000 21543
## pop2010 20596
## name Wyoming
## abbreviation WY
## census_division_name 8 (West / Mountain)
## ('Alabama', 4779736, '6 (South / East South Central)')
## ('Alaska', 710231, '9 (West / Pacific)')
## ('Arizona', 6392017, '8 (West / Mountain)')
## ('Arkansas', 2915918, '7 (South / West South Central)')
## ('California', 37253956, '9 (West / Pacific)')
## ('Colorado', 5029196, '8 (West / Mountain)')
## ('Connecticut', 3574097, '1 (Northeast / New England)')
## ('Delaware', 897934, '5 (South / South Atlantic)')
## ('District of Columbia', 601723, '5 (South / South Atlantic)')
## ('Florida', 18801310, '5 (South / South Atlantic)')
## ('Georgia', 9687653, '5 (South / South Atlantic)')
## ('Hawaii', 1360301, '9 (West / Pacific)')
## ('Idaho', 1567582, '8 (West / Mountain)')
## ('Illinois', 12830632, '3 (Midwest / East North Central)')
## ('Indiana', 6483802, '3 (Midwest / East North Central)')
## ('Iowa', 3046355, '4 (Midwest / West North Central)')
## ('Kansas', 2853118, '4 (Midwest / West North Central)')
## ('Kentucky', 4339367, '6 (South / East South Central)')
## ('Louisiana', 4533372, '7 (South / West South Central)')
## ('Maine', 1328361, '1 (Northeast / New England)')
## ('Maryland', 5773552, '5 (South / South Atlantic)')
## ('Massachusetts', 6547629, '1 (Northeast / New England)')
## ('Michigan', 9883640, '3 (Midwest / East North Central)')
## ('Minnesota', 5303925, '4 (Midwest / West North Central)')
## ('Mississippi', 2967297, '6 (South / East South Central)')
## ('Missouri', 5988927, '4 (Midwest / West North Central)')
## ('Montana', 989415, '8 (West / Mountain)')
## ('Nebraska', 1826341, '4 (Midwest / West North Central)')
## ('Nevada', 2700551, '8 (West / Mountain)')
## ('New Hampshire', 1316470, '1 (Northeast / New England)')
## ('New Jersey', 8791894, '2 (Northeast / Mid-Atlantic)')
## ('New Mexico', 2059179, '8 (West / Mountain)')
## ('New York', 19378102, '2 (Northeast / Mid-Atlantic)')
## ('North Carolina', 9535483, '5 (South / South Atlantic)')
## ('North Dakota', 672591, '4 (Midwest / West North Central)')
## ('Ohio', 11536504, '3 (Midwest / East North Central)')
## ('Oklahoma', 3751351, '7 (South / West South Central)')
## ('Oregon', 3831074, '9 (West / Pacific)')
## ('Pennsylvania', 12702379, '2 (Northeast / Mid-Atlantic)')
## ('Puerto Rico', 3725789, '0 (None)')
## ('Rhode Island', 1052567, '1 (Northeast / New England)')
## ('South Carolina', 4625364, '5 (South / South Atlantic)')
## ('South Dakota', 814180, '4 (Midwest / West North Central)')
## ('Tennessee', 6346105, '6 (South / East South Central)')
## ('Texas', 25145561, '7 (South / West South Central)')
## ('Utah', 2763885, '8 (West / Mountain)')
## ('Vermont', 625741, '1 (Northeast / New England)')
## ('Virginia', 8001024, '5 (South / South Atlantic)')
## ('Washington', 6724540, '9 (West / Pacific)')
## ('West Virginia', 1852994, '5 (South / South Atlantic)')
## ('Wisconsin', 5686986, '3 (Midwest / East North Central)')
## ('Wyoming', 563626, '8 (West / Mountain)')
Chapter 4 - Creating and Manipulating Databases
Creating Databases and Tables - different by database types, and outside the scope of this course:
Inserting Data into a Table - done with the insert() command:
Updating Data in a Database - done with the update() statement, like an insert() statement but with a where clause:
Removing Data from a Database - done with the delete() statement - BE CAREFUL!:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
# Import sqlalchemy functions
from sqlalchemy import create_engine, MetaData, Table, select, func, desc
# Import Table, Column, String, Integer, Float, Boolean from sqlalchemy
from sqlalchemy import Table, Column, String, Integer, Float, Boolean
# Set up for a new FAKE database
engine = create_engine("sqlite:///" + myPath + "_notuse_CreatedFake.db")
print(engine.table_names())
metadata = MetaData()
# Define a new table with a name, count, amount, and valid column: data
data = Table('data', metadata,
Column("name", String(255)),
Column('count', Integer()),
Column("amount", Float()),
Column("valid", Boolean())
)
# Use the metadata to create the table
metadata.create_all(engine)
# Print table details
print(repr(data))
# Define a new table with a name, count, amount, and valid column: data
data02 = Table('data02', metadata,
Column('name', String(255), unique=True),
Column('count', Integer(), default=1),
Column('amount', Float()),
Column('valid', Boolean(), default=False)
)
# Use the metadata to create the table
metadata.create_all(engine)
# Print the table details
print(repr(metadata.tables['data02']))
# Import insert and select from sqlalchemy
from sqlalchemy import insert
# Build an insert statement to insert a record into the data table: stmt
stmt = insert(data02).values(name="Anna", count=1, amount=1000.00, valid=True)
# Execute the statement via the connection: results
connection = engine.connect()
results = connection.execute(stmt)
# Print result rowcount
print(results.rowcount)
# Build a select statement to validate the insert
stmt = select([data02]).where(data02.columns.name == "Anna")
# Print the result of executing the query.
print(connection.execute(stmt).first())
# Delete the row so the table is empty again
stmt = "DELETE FROM data02" # Since there is no WHERE, this will delete everything
results = connection.execute(stmt)
print(results.rowcount)
# Build a list of dictionaries: values_list
values_list = [
{'name': "Anna", 'count': 1, 'amount': 1000.00, 'valid': True},
{'name': "Taylor", 'count': 1, 'amount': 750.00, 'valid': False}
]
# Build an insert statement for the data table: stmt
stmt = insert(data02)
# Execute stmt with the values_list: results
results = connection.execute(stmt, values_list)
# Print rowcount
print(results.rowcount)
# Place census data in the fake DB
census = Table('census', metadata,
Column('state', String(255)),
Column('gender', String(6)),
Column('age', String(255)),
Column('pop2000', Integer()),
Column('pop2010', Integer())
)
metadata.create_all(engine)
print(repr(data))
# Create a insert statement for census: stmt
stmt = insert(census)
# Create an empty list and zeroed row count: values_list, total_rowcount
values_list = []
total_rowcount = 0
# Enumerate the rows of csv_reader
for idx, row in enumerate(open(myPath + "_notuse_census2000.csv", "r")):
if idx == 0 :
print("Headers are: ", row)
continue
# Headers for this file are id,state,gender,age,pop2000,pop2010
rowItems = row.split(",")
data = {'state': rowItems[1], 'gender': rowItems[2], 'age': rowItems[3], 'pop2000': int(rowItems[4]),
'pop2010': int(rowItems[5])}
values_list.append(data)
# Check to see if divisible by 51
if idx % 51 == 0:
results = connection.execute(stmt, values_list)
total_rowcount += results.rowcount
values_list = []
# Print total rowcount
print(total_rowcount)
# Place state_fact data in the fake DB
state_fact = Table('state_fact', metadata,
Column('name', String(255)),
Column('abbreviation', String(2)),
Column('census_division_name', String(255)),
Column('fips_state', Integer(), default=0),
Column('notes', String(255), default="none")
)
metadata.create_all(engine)
print(repr(state_fact))
# Read CSV for state facts
stateFact = pd.read_csv(myPath + "_notuse_stateFact.csv")
values_list = []
for x in range(stateFact.shape[0]):
y = stateFact.iloc[x, :]
values_list.append( { "name":y["name"], "abbreviation":y["abbreviation"], "census_division_name":y["census_division_name"] })
# Create the table
stmt = insert(state_fact)
results = connection.execute(stmt, values_list)
# Build a select statement: select_stmt
select_stmt = select([state_fact]).where(state_fact.columns.name == "New York")
# Print the results of executing the select_stmt
print(connection.execute(select_stmt).fetchall())
# Build a statement to update the fips_state to 36: stmt
from sqlalchemy import update
stmt = update(state_fact).values(fips_state = 36)
# Append a where clause to limit it to records for New York state
stmt = stmt.where(state_fact.columns.name == "New York")
# Execute the statement: results
results = connection.execute(stmt)
# Print rowcount
print(results.rowcount)
# Execute the select_stmt again to view the changes
print(connection.execute(select_stmt).fetchall())
# Build a statement to update the notes to 'The Wild West': stmt
stmt = update(state_fact).values(notes = "The Wild West")
# Append a where clause to match the West census region records
stmt = stmt.where(state_fact.columns.census_division_name == "8 (West / Mountain)")
# Execute the statement: results
results = connection.execute(stmt)
# Print rowcount
print(results.rowcount)
# Build a statement to select name from state_fact: stmt
# fips_stmt = select([state_fact.columns.name])
# Append a where clause to Match the fips_state to flat_census fips_code
# fips_stmt = fips_stmt.where(
# state_fact.columns.fips_state == flat_census.columns.fips_code)
# Build an update statement to set the name to fips_stmt: update_stmt
# update_stmt = update(flat_census).values(state_name=fips_stmt)
# Execute update_stmt: results
# results = connection.execute(update_stmt)
# Print rowcount
# print(results.rowcount)
# Import delete, select
from sqlalchemy import delete, select
# Build a statement to empty the census table: stmt
stmt = delete(census)
# Execute the statement: results
results = connection.execute(stmt)
# Print affected rowcount
print(results.rowcount)
# Build a statement to select all records from the census table
stmt = select([census])
# Print the results of executing the statement to verify there are no rows
print(connection.execute(stmt).fetchall())
# Build a statement to count records using the sex column for Men ('M') age 36: stmt
# stmt = select([func.count(census.columns.sex)]).where(
# and_(census.columns.sex == 'M',
# census.columns.age == 36)
# )
# Execute the select statement and use the scalar() fetch method to save the record count
# to_delete = connection.execute(stmt).scalar()
# Build a statement to delete records from the census table: stmt_del
# stmt_del = delete(census)
# Append a where clause to target Men ('M') age 36
# stmt_del = stmt_del.where(
# and_(census.columns.sex == "M",
# census.columns.age == 36)
# )
# Execute the statement: results
# results = connection.execute(stmt_del)
# Print affected rowcount and to_delete record count, make sure they match
# print(results.rowcount, to_delete)
# Drop the state_fact table
state_fact.drop(engine)
# Check to see if state_fact exists
print(state_fact.exists(engine))
# Drop all tables
metadata.drop_all(engine)
# Check to see if census exists
print(census.exists(engine))
# Get rid of all tables in the database
metadata.drop_all(engine)
connection.close()
## []
## Table('data', MetaData(bind=None), Column('name', String(length=255), table=<data>), Column('count', Integer(), table=<data>), Column('amount', Float(), table=<data>), Column('valid', Boolean(), table=<data>), schema=None)
## Table('data02', MetaData(bind=None), Column('name', String(length=255), table=<data02>), Column('count', Integer(), table=<data02>, default=ColumnDefault(1)), Column('amount', Float(), table=<data02>), Column('valid', Boolean(), table=<data02>, default=ColumnDefault(False)), schema=None)
## 1
## ('Anna', 1, 1000.0, True)
## 1
## 2
## Table('data', MetaData(bind=None), Column('name', String(length=255), table=<data>), Column('count', Integer(), table=<data>), Column('amount', Float(), table=<data>), Column('valid', Boolean(), table=<data>), schema=None)
## Headers are: id,state,gender,age,pop2000,pop2010
##
## 2346
## Table('state_fact', MetaData(bind=None), Column('name', String(length=255), table=<state_fact>), Column('abbreviation', String(length=2), table=<state_fact>), Column('census_division_name', String(length=255), table=<state_fact>), Column('fips_state', Integer(), table=<state_fact>, default=ColumnDefault(0)), Column('notes', String(length=255), table=<state_fact>, default=ColumnDefault('none')), schema=None)
## [('New York', 'NY', '2 (Northeast / Mid-Atlantic)', 0, 'none')]
## 1
## [('New York', 'NY', '2 (Northeast / Mid-Atlantic)', 36, 'none')]
## 8
## 2346
## []
## False
## False
Chapter 5 - Case Study
Census Case Study - three components:
Populating the Database - using CSV file from the Census:
Example Queries:
Example code includes:
myPath = "./PythonInputFiles/"
import pandas as pd
# Import sqlalchemy functions
from sqlalchemy import create_engine, MetaData, Table, select, func, desc
from sqlalchemy import Table, Column, String, Integer, Float, Boolean
# Define an engine to connect to chapter5.sqlite: engine
engine = create_engine('sqlite:///' + myPath + 'chapter5.sqlite')
# Initialize MetaData: metadata
metadata = MetaData()
# Build a census table: census
census = Table('census', metadata,
Column('state', String(30)),
Column("gender", String(6)),
Column("age", Float()),
Column("pop2000", Integer()),
Column("pop2010", Integer()),
Column("ageText", String(30))
)
# Create the table in the database
metadata.create_all(engine)
# Create mapping of text ages to numeric ages
import numpy as np
tmpAge = list(pd.read_csv(myPath + "_notuse_census2000.csv")["age"].unique())
tmpNum = [np.mean([int(x.split("to")[0]), int(x.split("to")[1])]) if x.find("to") > -1 else 0 for x in tmpAge]
tmpNum[tmpAge.index("20")] = 20
tmpNum[tmpAge.index("21")] = 21
tmpNum[tmpAge.index("lt5")] = 2.5
tmpNum[tmpAge.index("ge85")] = 90
# Create an empty list: values_list
values_list = []
# Iterate over the rows
for idx, row in enumerate(open(myPath + "_notuse_census2000.csv", "r")):
if idx == 0 :
print("Headers are: ", row)
continue
# Create a dictionary with the values
rowItems = row.split(",")
ageNum = tmpNum[tmpAge.index(rowItems[3])]
data = {'state': rowItems[1], 'gender': rowItems[2], 'age': ageNum, 'pop2000': int(rowItems[4]),
'pop2010': int(rowItems[5]), 'ageText':rowItems[3]}
values_list.append(data)
# Import insert
from sqlalchemy import insert
# Build insert statement: stmt
stmt = insert(census)
# Use values_list to insert data: results
connection = engine.connect()
results = connection.execute(stmt, values_list)
# Print rowcount
print(results.rowcount)
# Import select
from sqlalchemy import select
# Calculate weighted average age: stmt
stmt = select([census.columns.gender,
(func.sum(census.columns.age * census.columns.pop2010) /
func.sum(census.columns.pop2010)).label("average_age")
])
# Group by sex
stmt = stmt.group_by(census.columns.gender)
# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()
# Print the average age by sex
for x in results:
print(x[0], x[1])
# import case, cast and Float from sqlalchemy
from sqlalchemy import case, cast, Float
# Build a query to calculate the percentage of females in 2010: stmt
stmt = select([census.columns.state,
(func.sum(
case([
(census.columns.gender == 'female', census.columns.pop2010)
], else_=0)) /
cast(func.sum(census.columns.pop2010), Float) * 100).label('percent_female')
])
# Group By state
stmt = stmt.group_by(census.columns.state)
# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()
# Plot the results by state
import matplotlib.pyplot as plt
pctFemale = [y for x, y in results]
pctState = [x for x, y in results]
myDF = pd.DataFrame( {"% female":pd.to_numeric(pctFemale)}, index=pctState )
myDF.sort_values("% female", ascending=False).plot(kind="bar", ylim=(46, 54))
plt.title("% Female by State (2010 Census)")
# plt.show()
plt.savefig("_dummyPy075.png", bbox_inches="tight")
plt.clf()
# Print the percentage
# for result in results:
# print(result.state, result.percent_female)
# Build query to return state name and population difference from 2008 to 2000
stmt = select([census.columns.state,
(census.columns.pop2010 - census.columns.pop2000).label('pop_change')
])
# Group by State
stmt = stmt.group_by(census.columns.state)
# Order by Population Change
stmt = stmt.order_by(desc("pop_change"))
# Limit to top 10
stmt = stmt.limit(10)
# Use connection to execute the statement and fetch all results
results = connection.execute(stmt).fetchall()
# Print the state and population change for each record
for result in results:
print('{}:{}'.format(result.state, result.pop_change))
# Calculate average age by state (2010)
stmt = select([census.columns.state,
(func.sum(census.columns.age * census.columns.pop2010) /
func.sum(census.columns.pop2010)).label("average_age")
])
# Group by sex
stmt = stmt.group_by(census.columns.state)
# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()
myDF2 = pd.DataFrame( {"Avg. Age":pd.to_numeric([y for x, y in results])}, index=[x for x, y in results] )
myDF2.sort_values("Avg. Age", ascending=False).plot(kind="bar", ylim=(30, 45))
plt.title("Average Age by State (2010 Census)")
# plt.show()
plt.savefig("_dummyPy076.png", bbox_inches="tight")
plt.clf()
# Delete the DB
# Get rid of all tables in the database
metadata.drop_all(engine)
connection.close()
## C:\Users\Dave\AppData\Local\Programs\Python\PYTHON~1\lib\site-packages\sqlalchemy\sql\sqltypes.py:596: SAWarning: Dialect sqlite+pysqlite does *not* support Decimal objects natively, and SQLAlchemy must convert from floating point - rounding errors and other issues may occur. Please consider storing Decimal numbers as strings or integers on this platform for lossless storage.
## 'storage.' % (dialect.name, dialect.driver))
## Headers are: id,state,gender,age,pop2000,pop2010
##
## 2392
## female 38.46474229575023
## male 36.27156086981655
## Florida:22065
## Illinois:15716
## Texas:14908
## Indiana:6848
## Massachusetts:6111
## Virginia:5374
## Tennessee:5102
## Connecticut:4984
## Louisiana:4345
## North Carolina:3406
% Female (2010 Census) by State:
Average Age (2010 Census) by State:
Chapter 1 - Fundamental Data Types
Introduction and lists - “container sequences” hold other types of data:
Tuples - somewhat like a list in how they hold data, but with key differences:
Sets for unordered and unique data - excellent for finding all the unique values:
Example code includes:
myPath = "./PythonInputFiles/"
# Create a list containing the names: baby_names
baby_names = ['Ximena', 'Aliza', 'Ayden', 'Calvin']
# Extend baby_names with 'Rowen' and 'Sandeep'
baby_names.extend(['Rowen', 'Sandeep'])
# Print baby_names
print(baby_names)
# Find the position of 'Aliza': position
position = baby_names.index("Aliza")
# Remove 'Aliza' from baby_names
baby_names.pop(position)
# Print baby_names
print(baby_names)
# A list of lists, records has been pre-loaded. If you explore it in the IPython Shell, you'll see that each entry is a list of this form: ['2011', 'FEMALE', 'HISPANIC', 'GERALDINE', '13', '75']
# Dummy up something similar from the SSA data
import pandas as pd
pd2011 = pd.read_csv(myPath + "yob2011.txt", header=None, names=["Name", "Gender", "Count"])
# Speed the processing - keep only names with Count >= 5000
records2011 = []
for idx in pd2011.loc[pd2011["Count"] >= 5000].index:
rowData = pd2011.loc[idx]
newList = ["2011", rowData["Gender"], "NA", rowData["Name"], "NA", rowData["Count"]]
records2011.append(newList)
# Create the empty list: baby_names
baby_names = []
# Loop over a list of records
for row in records2011:
# Add the name found in column 3 to the list
baby_names.append(row[3])
# Sort the names in alphabetical order
for name in sorted(baby_names):
# Print each name
print(name)
girl_names = ['GRACE', 'Victoria', 'Rachel', 'Anna', 'Samantha', 'Kayla', 'Claire', 'Ashley', 'Zoe', 'Alina', 'Angela', 'Olivia', 'AVA', 'Valentina', 'CAMILA', 'Miriam', 'MADISON', 'Aaliyah', 'RACHEL', 'Serenity', 'EMILY', 'Mia', 'Chloe', 'MIA', 'LONDON', 'Chana', 'TAYLOR', 'CHLOE', 'FIONA', 'Camila', 'GABRIELLE', 'SOPHIA', 'CHANA', 'LEAH', 'ELLA', 'GENESIS', 'Madison', 'Emily', 'NEVAEH', 'ASHLEY', 'Isabella', 'ISABELLA', 'Sophia', 'OLIVIA', 'Leah', 'Esther', 'Mariam', 'JADA', 'London', 'TIFFANY', 'SERENITY', 'Emma', 'Savannah', 'CHAYA', 'KAYLA', 'SOFIA', 'ABIGAIL', 'Grace', 'Chaya', 'Taylor', 'ANGELA', 'Sarah', 'Brielle', 'MAKAYLA', 'EMMA', 'ESTHER', 'Ava', 'AALIYAH', 'HAILEY', 'MIRIAM', 'Skylar', 'SARAH', 'Fatoumata', 'Sofia']
boy_names = ['ANGEL', 'Jacob', 'Josiah', 'Daniel', 'CHRISTIAN', 'William', 'MASON', 'Eric', 'JUSTIN', 'LUCAS', 'Mason', 'TYLER', 'Elijah', 'Noah', 'ISAIAH', 'JEREMIAH', 'JOSHUA', 'JAYDEN', 'Samuel', 'KEVIN', 'AIDEN', 'James', 'Aiden', 'Alexander', 'ELIJAH', 'Benjamin', 'Jeremiah', 'Liam', 'Carter', 'ANTHONY', 'Ryan', 'DAVID', 'DANIEL', 'Joshua', 'JAMES', 'Joseph', 'JACOB', 'RYAN', 'Dylan', 'Ethan', 'JACK', 'NOAH', 'David', 'SAMUEL', 'Lucas', 'Matthew', 'Jack', 'Jason', 'ALEXANDER', 'MATTHEW', 'Michael', 'Jayden', 'MOSHE', 'ETHAN', 'JOSEPH', 'MUHAMMAD', 'SEBASTIAN', 'BENJAMIN', 'Moshe', 'Amir', 'Sebastian', 'MICHAEL', 'CHRISTOPHER', 'Angel', 'JOSIAH', 'ERIC', 'JASON', 'Muhammad']
# Pair up the boy and girl names: pairs
pairs = zip(girl_names, boy_names)
# Iterate over pairs
for idx, pair in enumerate(pairs):
# Unpack pair: girl_name, boy_name
girl_name, boy_name = pair
# Print the rank and names associated with each rank
print('Rank {}: {} and {}'.format(idx, girl_name, boy_name))
# Create the normal variable: normal
normal = "simple"
# Create the mistaken variable: error
error = 'trailing comma',
# Print the types of the variables
print(type(normal))
print(type(error))
# Same SSA process for 2014 baby names
pd2014 = pd.read_csv(myPath + "yob2014.txt", header=None, names=["Name", "Gender", "Count"])
# Speed the processing - keep only names with Count >= 5000
records2014 = []
for idx in pd2014.loc[pd2014["Count"] >= 5000].index:
rowData = pd2014.loc[idx]
newList = ["2014", rowData["Gender"], "NA", rowData["Name"], "NA", rowData["Count"]]
records2014.append(newList)
# Convert them to sets (only names with 5,000+)
baby_names_2011 = set(pd2011.loc[pd2011["Count"] >= 5000]["Name"])
baby_names_2014 = set(pd2014.loc[pd2014["Count"] >= 5000]["Name"])
# Find the union: all_names
all_names = baby_names_2011.union(baby_names_2014)
# Print the count of names in all_names
print(len(all_names))
# Find the intersection: overlapping_names
overlapping_names = baby_names_2011.intersection(baby_names_2014)
# Print the count of names in overlapping_names
print(len(overlapping_names))
# Create the empty set: baby_names_2011
baby_names_2011 = set()
# Loop over records and add the names from 2011 to the baby_names_2011 set
for row in records2011:
# Check if the first column is '2011'
if row[0] == '2011':
# Add the fourth column to the set
baby_names_2011.add(row[3])
# Find the difference between 2011 and 2014: differences
differences = baby_names_2011.difference(baby_names_2014)
# Print the differences
print(differences)
## ['Ximena', 'Aliza', 'Ayden', 'Calvin', 'Rowen', 'Sandeep']
## ['Ximena', 'Ayden', 'Calvin', 'Rowen', 'Sandeep']
## Aaliyah
## Aaron
## Abigail
## Adam
## Addison
## Adrian
## Aiden
## Alexander
## Alexis
## Allison
## Alyssa
## Amelia
## Andrew
## Angel
## Anna
## Anthony
## Ashley
## Aubrey
## Audrey
## Austin
## Ava
## Avery
## Ayden
## Benjamin
## Bentley
## Blake
## Brandon
## Brayden
## Brianna
## Brody
## Brooklyn
## Caleb
## Cameron
## Carter
## Charles
## Charlotte
## Chase
## Chloe
## Christian
## Christopher
## Colton
## Connor
## Cooper
## Daniel
## David
## Dominic
## Dylan
## Eli
## Elijah
## Elizabeth
## Ella
## Emily
## Emma
## Ethan
## Evan
## Evelyn
## Gabriel
## Gabriella
## Gavin
## Grace
## Hailey
## Hannah
## Henry
## Hunter
## Ian
## Isaac
## Isabella
## Isaiah
## Jack
## Jackson
## Jacob
## James
## Jason
## Jayden
## Jeremiah
## John
## Jonathan
## Jordan
## Jose
## Joseph
## Joshua
## Josiah
## Julian
## Justin
## Kaylee
## Kevin
## Landon
## Layla
## Leah
## Levi
## Liam
## Lillian
## Lily
## Logan
## Lucas
## Luke
## Madison
## Mason
## Matthew
## Mia
## Michael
## Natalie
## Nathan
## Nevaeh
## Nicholas
## Noah
## Oliver
## Olivia
## Owen
## Parker
## Riley
## Robert
## Ryan
## Samantha
## Samuel
## Sarah
## Savannah
## Sebastian
## Sofia
## Sophia
## Taylor
## Thomas
## Tyler
## Victoria
## William
## Wyatt
## Xavier
## Zachary
## Zoe
## Zoey
## Rank 0: GRACE and ANGEL
## Rank 1: Victoria and Jacob
## Rank 2: Rachel and Josiah
## Rank 3: Anna and Daniel
## Rank 4: Samantha and CHRISTIAN
## Rank 5: Kayla and William
## Rank 6: Claire and MASON
## Rank 7: Ashley and Eric
## Rank 8: Zoe and JUSTIN
## Rank 9: Alina and LUCAS
## Rank 10: Angela and Mason
## Rank 11: Olivia and TYLER
## Rank 12: AVA and Elijah
## Rank 13: Valentina and Noah
## Rank 14: CAMILA and ISAIAH
## Rank 15: Miriam and JEREMIAH
## Rank 16: MADISON and JOSHUA
## Rank 17: Aaliyah and JAYDEN
## Rank 18: RACHEL and Samuel
## Rank 19: Serenity and KEVIN
## Rank 20: EMILY and AIDEN
## Rank 21: Mia and James
## Rank 22: Chloe and Aiden
## Rank 23: MIA and Alexander
## Rank 24: LONDON and ELIJAH
## Rank 25: Chana and Benjamin
## Rank 26: TAYLOR and Jeremiah
## Rank 27: CHLOE and Liam
## Rank 28: FIONA and Carter
## Rank 29: Camila and ANTHONY
## Rank 30: GABRIELLE and Ryan
## Rank 31: SOPHIA and DAVID
## Rank 32: CHANA and DANIEL
## Rank 33: LEAH and Joshua
## Rank 34: ELLA and JAMES
## Rank 35: GENESIS and Joseph
## Rank 36: Madison and JACOB
## Rank 37: Emily and RYAN
## Rank 38: NEVAEH and Dylan
## Rank 39: ASHLEY and Ethan
## Rank 40: Isabella and JACK
## Rank 41: ISABELLA and NOAH
## Rank 42: Sophia and David
## Rank 43: OLIVIA and SAMUEL
## Rank 44: Leah and Lucas
## Rank 45: Esther and Matthew
## Rank 46: Mariam and Jack
## Rank 47: JADA and Jason
## Rank 48: London and ALEXANDER
## Rank 49: TIFFANY and MATTHEW
## Rank 50: SERENITY and Michael
## Rank 51: Emma and Jayden
## Rank 52: Savannah and MOSHE
## Rank 53: CHAYA and ETHAN
## Rank 54: KAYLA and JOSEPH
## Rank 55: SOFIA and MUHAMMAD
## Rank 56: ABIGAIL and SEBASTIAN
## Rank 57: Grace and BENJAMIN
## Rank 58: Chaya and Moshe
## Rank 59: Taylor and Amir
## Rank 60: ANGELA and Sebastian
## Rank 61: Sarah and MICHAEL
## Rank 62: Brielle and CHRISTOPHER
## Rank 63: MAKAYLA and Angel
## Rank 64: EMMA and JOSIAH
## Rank 65: ESTHER and ERIC
## Rank 66: Ava and JASON
## Rank 67: AALIYAH and Muhammad
## <class 'str'>
## <class 'tuple'>
## 143
## 113
## {'Riley', 'Aaliyah', 'Sarah', 'Brody', 'Justin', 'Bentley', 'Nevaeh', 'Blake', 'Cooper', 'Taylor', 'Hailey', 'Alexis', 'Kaylee', 'Brianna', 'Xavier', 'Alyssa', 'Ashley'}
Chapter 2 - Dictionaries
Using dictionaries - “everything in Python is a dictionary” is a common joke:
Altering dictionaries - dictionaries are mutable:
Pythonically using dictionaries - efficient means of interacting with dictionaries:
Working with CSV files (comma separated values files) - one of the most common storage systems:
Example code includes:
myPath = "./PythonInputFiles/"
# Create top-50 female_baby_names_2012 as list of (name, rank) tuples
import pandas as pd
pd2012 = pd.read_csv(myPath + "yob2012.txt", header=None, names=["Name", "Gender", "Count"])
babyTop = pd2012.loc[pd2012["Gender"] == "F"].sort_values("Count", ascending=False)
female_baby_names_2012 = list(zip(babyTop["Name"][0:50], list(range(1, 51))))
# Create an empty dictionary: names
names = {}
# Loop over the girl names
for name, rank in female_baby_names_2012:
# Add each name to the names dictionary using rank as the key
names[rank] = name
# Sort the names list by rank in descending order and slice the first 10 items (popularity 41-50)
for rank in sorted(names, reverse=True)[:10]:
# Print each item
print(names[rank])
# Safely print rank 7 from the names dictionary
print(names.get(7))
# Safely print the type of rank 100 from the names dictionary
print(type(names.get(100)))
# Safely print rank 105 from the names dictionary or 'Not Found'
print(names.get(105, "Not Found"))
# Create the boy_names dictionary - start with 2013
pd2013 = pd.read_csv(myPath + "yob2013.txt", header=None, names=["Name", "Gender", "Count"])
boyTop = pd2013.loc[pd2013["Gender"] == "M"].sort_values("Count", ascending=False)
male_baby_names_2013 = list( zip( list(range(1, 51)), boyTop["Name"][0:50] ) )
boyTop = pd2012.loc[pd2012["Gender"] == "M"].sort_values("Count", ascending=False)
male_baby_names_2012 = list( zip( list(range(1, 51)), boyTop["Name"][0:50] ) )
pd2011 = pd.read_csv(myPath + "yob2011.txt", header=None, names=["Name", "Gender", "Count"])
boyTop = pd2011.loc[pd2011["Gender"] == "M"].sort_values("Count", ascending=False)
male_baby_names_2011 = list( zip( list(range(1, 51)), boyTop["Name"][0:50] ) )
pd2014 = pd.read_csv(myPath + "yob2014.txt", header=None, names=["Name", "Gender", "Count"])
boyTop = pd2014.loc[pd2014["Gender"] == "M"].sort_values("Count", ascending=False)
male_baby_names_2014 = list( zip( list(range(1, 51)), boyTop["Name"][0:50] ) )
# male_baby_names_2013 is a dictionary of rank-name, nested in dictionary boy_names with key 2013
boy_names = { 2013 : dict(male_baby_names_2013) , 2012 : dict(male_baby_names_2012) , 2014 : dict(male_baby_names_2014)}
# Print a list of keys from the boy_names dictionary
print(boy_names.keys())
# Print a list of keys from the boy_names dictionary for the year 2013
print(boy_names[2013].keys())
# Loop over the dictionary
for year in boy_names:
# Safely print the year and the third ranked name or 'Unknown'
print(year, boy_names[year].get(3, "Unknown"))
# Assign the names_2011 dictionary as the value to the 2011 key of boy_names
boy_names[2011] = dict(male_baby_names_2011)
# Update the 2012 key in the boy_names dictionary
boy_names[2012].update([(1, 'Casey'), (2, 'Aiden')])
# Loop over the boy_names dictionary
for year in boy_names:
# Loop over and sort the data for each year by descending rank
for rank in sorted(boy_names[year], reverse=True)[:1]:
# Check that you have a rank
if not rank:
print(year, 'No Data Available')
# Safely print the year and the least popular name or 'Not Available'
print(year, boy_names[year].get(rank))
# Make the female_names dictionary of top-10 names by year
girlTop = pd2013.loc[pd2013["Gender"] == "F"].sort_values("Count", ascending=False)
female_baby_names_2013 = list( zip( list(range(1, 11)), girlTop["Name"][0:10] ) )
girlTop = pd2012.loc[pd2012["Gender"] == "F"].sort_values("Count", ascending=False)
female_baby_names_2012 = list( zip( list(range(1, 11)), girlTop["Name"][0:10] ) )
girlTop = pd2011.loc[pd2011["Gender"] == "F"].sort_values("Count", ascending=False)
female_baby_names_2011 = list( zip( list(range(1, 11)), girlTop["Name"][0:10] ) )
girlTop = pd2014.loc[pd2014["Gender"] == "F"].sort_values("Count", ascending=False)
female_baby_names_2014 = list( zip( list(range(1, 11)), girlTop["Name"][0:10] ) )
# female_names_2013 is a nested dictionary
female_names = { 2013 : dict(female_baby_names_2013) , 2012 : dict(female_baby_names_2012) , 2014 : dict(female_baby_names_2014), 2011: dict(female_baby_names_2011) }
# Remove 2011 and store it: female_names_2011
female_names_2011 = female_names.pop(2011)
# Safely remove 2015 with a empty dictionary as the default and store it: female_names_2015
female_names_2015 = female_names.pop(2015, {})
# Delete 2012
del female_names[2012]
# Print female_names
print(female_names)
# Iterate over the 2014 nested dictionary
for rank, name in female_names[2014].items():
# Print rank and name
print(rank, name)
# Iterate over the 2013 nested dictionary
for rank, name in female_names[2013].items():
# Print rank and name
print(rank, name)
# Check to see if 2011 is in female_names
if 2011 in female_names:
# Print 'Found 2011'
print('Found 2011')
# Check to see if rank 1 is in 2013
if 1 in female_names[2013]:
# Print 'Found Rank 1 in 2013' if found
print('Found Rank 1 in 2013')
else:
# Print 'Rank 1 missing from 2013' if not found
print('Rank 1 missing from 2013')
# Check to see if Rank 100 is in 2013
if 100 in female_names[2013]:
print('Found Rank 100')
else:
print('Rank 100 missing from 2013')
# Created top10 female names for 2013 as Year - "F" - "NA" - Name - "NA" - Rank
# topFemale = female_baby_names_2013
# rankData = [a for a, b in topFemale]
# nameData = [b for a, b in topFemale]
# babyData = pd.DataFrame( {"YEAR": 2013, "GENDER": "F", "FILL1": "NA", "NAME": nameData, "FILL2": "NA", "RANK": rankData} )[["YEAR", "GENDER", "FILL1", "NAME", "FILL2", "RANK"]]
# babyData.to_csv(myPath + "baby_names.csv", index=False)
# Import the python CSV module
import csv
# Create a python file object in read mode for the baby_names.csv file: csvfile
csvfile = open(myPath + "baby_names.csv", "r")
baby_names = {}
# Loop over a csv reader on the file object
for row in csv.reader(csvfile):
# Print each row
print(row)
# Add the rank and name to the dictionary
if row[5] != "RANK":
baby_names[int(row[5])] = row[3]
# Print the dictionary keys
print(baby_names.keys())
# Create a python file object in read mode for the `baby_names.csv` file: csvfile
csvfile = open(myPath + "baby_names.csv", "r")
baby_names = {}
# Loop over a DictReader on the file
for row in csv.DictReader(csvfile):
# Print each row
print(row)
# Add the rank and name to the dictionary: baby_names
baby_names[int(row["RANK"])] = row["NAME"]
# Print the dictionary
print(baby_names.keys())
## Ashley
## Arianna
## Camila
## Riley
## Taylor
## Claire
## Alyssa
## Sarah
## Savannah
## Audrey
## Abigail
## <class 'NoneType'>
## Not Found
## dict_keys([2013, 2012, 2014])
## dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])
## 2013 Liam
## 2012 Ethan
## 2014 Mason
## 2013 Levi
## 2012 Tyler
## 2014 Aaron
## 2011 Julian
## {2013: {1: 'Sophia', 2: 'Emma', 3: 'Olivia', 4: 'Isabella', 5: 'Ava', 6: 'Mia', 7: 'Emily', 8: 'Abigail', 9: 'Madison', 10: 'Elizabeth'}, 2014: {1: 'Emma', 2: 'Olivia', 3: 'Sophia', 4: 'Isabella', 5: 'Ava', 6: 'Mia', 7: 'Emily', 8: 'Abigail', 9: 'Madison', 10: 'Charlotte'}}
## 1 Emma
## 2 Olivia
## 3 Sophia
## 4 Isabella
## 5 Ava
## 6 Mia
## 7 Emily
## 8 Abigail
## 9 Madison
## 10 Charlotte
## 1 Sophia
## 2 Emma
## 3 Olivia
## 4 Isabella
## 5 Ava
## 6 Mia
## 7 Emily
## 8 Abigail
## 9 Madison
## 10 Elizabeth
## Found Rank 1 in 2013
## Rank 100 missing from 2013
## ['YEAR', 'GENDER', 'FILL1', 'NAME', 'FILL2', 'RANK']
## ['2013', 'F', 'NA', 'Sophia', 'NA', '1']
## ['2013', 'F', 'NA', 'Emma', 'NA', '2']
## ['2013', 'F', 'NA', 'Olivia', 'NA', '3']
## ['2013', 'F', 'NA', 'Isabella', 'NA', '4']
## ['2013', 'F', 'NA', 'Ava', 'NA', '5']
## ['2013', 'F', 'NA', 'Mia', 'NA', '6']
## ['2013', 'F', 'NA', 'Emily', 'NA', '7']
## ['2013', 'F', 'NA', 'Abigail', 'NA', '8']
## ['2013', 'F', 'NA', 'Madison', 'NA', '9']
## ['2013', 'F', 'NA', 'Elizabeth', 'NA', '10']
## dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Sophia'), ('FILL2', 'NA'), ('RANK', '1')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Emma'), ('FILL2', 'NA'), ('RANK', '2')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Olivia'), ('FILL2', 'NA'), ('RANK', '3')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Isabella'), ('FILL2', 'NA'), ('RANK', '4')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Ava'), ('FILL2', 'NA'), ('RANK', '5')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Mia'), ('FILL2', 'NA'), ('RANK', '6')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Emily'), ('FILL2', 'NA'), ('RANK', '7')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Abigail'), ('FILL2', 'NA'), ('RANK', '8')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Madison'), ('FILL2', 'NA'), ('RANK', '9')])
## OrderedDict([('YEAR', '2013'), ('GENDER', 'F'), ('FILL1', 'NA'), ('NAME', 'Elizabeth'), ('FILL2', 'NA'), ('RANK', '10')])
## dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
Chapter 3 - Collections Module
Counting made easy - collections module (advanced data containers; part of Standard Library):
Dictionaries of unknown structure - default dictionaries:
Maintaining dictionary order with OrderedDict:
Class and Namedtuple - a namedtuple is a tuple where each position has a name:
Example code includes:
myPath = "./PythonInputFiles/"
# Create stations data from the CSV downloaded from Chicago Open Data
# https://data.cityofchicago.org/Transportation/CTA-Ridership-L-Station-Entries-Daily-Totals/5neh-572f/data
# Filtered the data to download only 2015-2016
import pandas as pd
statRaw = pd.read_csv(myPath + "CTA_Ridership_Station_Entries_Daily_Totals.csv")
statRaw.head()
len(statRaw["stationname"].value_counts())
# stations originally a list of length 100801 of CTA stations (700 each of 144 stations, plus "station_name")
# Make it a 731 of all days in 2015 and 2016 instead
stations = list(statRaw["stationname"])
# Import the Counter object
from collections import Counter
# Print the first ten items from the stations list
print(stations[:10])
# Create a Counter of the stations list: station_count
station_count = Counter(stations)
# Print the station_count
print(station_count)
# Create a Counter of the stations list: station_count
station_count = Counter(stations)
# Find the 5 most common elements
print(station_count.most_common(5))
# Create entries as an enumerator that can be unpacked to date-stop-riders
# miniStat = statRaw.iloc[0:100, :]
entries = zip(statRaw["date"], statRaw["stationname"], statRaw["rides"])
# Create an empty dictionary: ridership
ridership = {}
# Iterate over the entries
for date, stop, riders in entries:
# Check to see if date is already in the dictionary
if date not in ridership:
# Create an empty list for any missing date
ridership[date] = []
# Append the stop and riders as a tuple to the date keys list
ridership[date].append((stop, riders))
# Print the ridership for '03/09/2016'
print(ridership["03/09/2016"])
# Import defaultdict
from collections import defaultdict
# Create a defaultdict with a default type of list: ridership
ridership = defaultdict(list)
# Need to re-create the enumerator - it is gone when used above!
entries = zip(statRaw["date"], statRaw["stationname"], statRaw["rides"])
# Iterate over the entries
for date, stop, riders in entries:
# Use the stop as the key of ridership and append the riders to its value
ridership[stop].append(riders)
# Print the first 10 items of the ridership dictionary
# print(list(ridership.items())[:10]) # a spectacularly bad idea due to length!
[(a, len(x), sum(x)) for a, x in list(ridership.items())[:10]] # just to get a sense for the data
# Import OrderedDict from collections
from collections import OrderedDict
# Create an OrderedDict called: ridership_date
ridership_date = OrderedDict()
# Need to re-create the enumerator - only want date and riders this time!
entries = zip(statRaw["date"], statRaw["rides"])
# Iterate over the entries
for date, riders in entries:
# If a key does not exist in ridership_date, set it to 0
if not date in ridership_date:
ridership_date[date] = 0
# Add riders to the date key in ridership_date
ridership_date[date] += riders
# Print the first 31 records
print(list(ridership_date.items())[:31])
# Print the first key in ridership_date
print(list(ridership_date.keys())[0])
# Pop the first item from ridership_date and print it
print(ridership_date.popitem(last=False))
# Print the last key in ridership_date
print(list(ridership_date.keys())[-1])
# Pop the last item from ridership_date and print it
print(ridership_date.popitem())
# Import namedtuple from collections
from collections import namedtuple
# Create the namedtuple: DateDetails
DateDetails = namedtuple('DateDetails', ['date', 'stop', 'riders'])
# Create the empty list: labeled_entries
labeled_entries = []
# Need to re-create the enumerator - it is gone when used above!
entries = zip(statRaw["date"], statRaw["stationname"], statRaw["rides"])
# Iterate over the entries
for date, stop, riders in entries:
# Append a new DateDetails namedtuple instance for each entry to labeled_entries
labeled_entries.append(DateDetails(date, stop, riders))
# Print the first 5 items in labeled_entries
print(labeled_entries[:5])
# Iterate over the first twenty items in labeled_entries
for item in labeled_entries[:20]:
# Print each item's stop, date, and riders
print(item.date, item.riders, item.stop)
## ['Austin-Forest Park', 'Harlem-Lake', 'Pulaski-Lake', 'Quincy/Wells', 'Davis', "Belmont-O'Hare", 'Jackson/Dearborn', 'Sheridan', 'Damen-Brown', 'Morse']
## Counter({'Austin-Forest Park': 731, 'Harlem-Lake': 731, 'Pulaski-Lake': 731, 'Quincy/Wells': 731, 'Davis': 731, "Belmont-O'Hare": 731, 'Jackson/Dearborn': 731, 'Sheridan': 731, 'Damen-Brown': 731, 'Morse': 731, '35th/Archer': 731, '51st': 731, 'Dempster-Skokie': 731, 'Pulaski-Cermak': 731, 'LaSalle/Van Buren': 731, 'Ashland-Lake': 731, 'Oak Park-Forest Park': 731, 'Sox-35th-Dan Ryan': 731, 'Randolph/Wabash': 731, 'Damen-Cermak': 731, 'Western-Forest Park': 731, 'Cumberland': 731, '79th': 731, 'Kedzie-Homan-Forest Park': 731, 'State/Lake': 731, 'Main': 731, 'Central-Lake': 731, 'Ashland/63rd': 731, 'Indiana': 731, 'Western-Orange': 731, 'Division/Milwaukee': 731, 'Grand/State': 731, 'Berwyn': 731, 'UIC-Halsted': 731, 'Southport': 731, 'Washington/Dearborn': 731, 'Clark/Lake': 731, 'Forest Park': 731, 'Noyes': 731, 'Cicero-Cermak': 731, 'Clinton-Forest Park': 731, 'California-Cermak': 731, '95th/Dan Ryan': 731, 'Merchandise Mart': 731, 'Racine': 731, 'Cicero-Lake': 731, 'Grand/Milwaukee': 731, 'Garfield-South Elevated': 731, 'Foster': 731, 'Diversey': 731, 'Wilson': 731, "Irving Park-O'Hare": 731, 'Jackson/State': 731, 'California/Milwaukee': 731, '54th/Cermak': 731, 'Damen/Milwaukee': 731, 'Kostner': 731, 'Ridgeland': 731, 'Clark/Division': 731, 'Madison/Wabash': 731, 'North/Clybourn': 731, 'Armitage': 731, 'Western/Milwaukee': 731, 'Adams/Wabash': 731, 'Dempster': 731, 'Laramie': 731, 'Chicago/Franklin': 731, 'East 63rd-Cottage Grove': 731, 'Washington/Wells': 731, 'Western-Cermak': 731, "Harlem-O'Hare": 731, 'Granville': 731, 'Lawrence': 731, 'Central Park': 731, 'Monroe/Dearborn': 731, 'Sedgwick': 731, 'Medical Center': 731, 'Rosemont': 731, '18th': 731, 'South Boulevard': 731, 'Library': 731, 'Francisco': 731, 'Thorndale': 731, "O'Hare Airport": 731, 'Howard': 731, '63rd-Dan Ryan': 731, 'Pulaski-Forest Park': 731, 'Midway Airport': 731, 'Halsted/63rd': 731, 'Pulaski-Orange': 731, 'Cicero-Forest Park': 731, 'Harlem-Forest Park': 731, '69th': 731, 'Cermak-Chinatown': 731, 'Rockwell': 731, 'Logan Square': 731, 'Polk': 731, 'Kedzie-Cermak': 731, 'Linden': 731, 'Ashland-Orange': 731, 'Kedzie-Lake': 731, '47th-South Elevated': 731, 'Monroe/State': 731, '35-Bronzeville-IIT': 731, 'Halsted-Orange': 731, 'King Drive': 731, 'Kedzie-Midway': 731, 'Clinton-Lake': 731, 'Garfield-Dan Ryan': 731, 'Kedzie-Brown': 731, 'Jarvis': 731, 'Argyle': 731, 'Wellington': 731, 'Fullerton': 731, '47th-Dan Ryan': 731, "Addison-O'Hare": 731, 'Central-Evanston': 731, 'Austin-Lake': 731, '43rd': 731, 'Jefferson Park': 731, 'Kimball': 731, 'Loyola': 731, 'Paulina': 731, 'Belmont-North Main': 731, "Montrose-O'Hare": 731, 'LaSalle': 731, 'Oak Park-Lake': 731, 'California-Lake': 731, 'Bryn Mawr': 731, 'Roosevelt': 731, 'Chicago/Milwaukee': 731, 'Addison-North Main': 731, '87th': 731, 'Addison-Brown': 731, 'Chicago/State': 731, 'Irving Park-Brown': 731, 'Western-Brown': 731, 'Harrison': 731, 'Montrose-Brown': 731, 'Morgan-Lake': 731, 'Lake/State': 731, 'Conservatory': 731, 'Oakton-Skokie': 731, 'Cermak-McCormick Place': 731})
## [('Austin-Forest Park', 731), ('Harlem-Lake', 731), ('Pulaski-Lake', 731), ('Quincy/Wells', 731), ('Davis', 731)]
## [('Austin-Forest Park', 2128), ('Harlem-Lake', 3769), ('Pulaski-Lake', 1502), ('Quincy/Wells', 8139), ('Davis', 3656), ("Belmont-O'Hare", 5294), ('Jackson/Dearborn', 8369), ('Sheridan', 5823), ('Damen-Brown', 3048), ('Morse', 4826), ('35th/Archer', 3450), ('51st', 1033), ('Dempster-Skokie', 1697), ('Pulaski-Cermak', 1259), ('LaSalle/Van Buren', 3104), ('Ashland-Lake', 2486), ('Oak Park-Forest Park', 1882), ('Sox-35th-Dan Ryan', 4967), ('Randolph/Wabash', 9659), ('Damen-Cermak', 1572), ('Western-Forest Park', 1819), ('Cumberland', 4589), ('79th', 7476), ('Kedzie-Homan-Forest Park', 2256), ('State/Lake', 10594), ('Main', 1129), ('Central-Lake', 2145), ('Ashland/63rd', 1302), ('Indiana', 919), ('Western-Orange', 3958), ('Division/Milwaukee', 6580), ('Grand/State', 10949), ('Berwyn', 3539), ('UIC-Halsted', 7523), ('Southport', 3467), ('Washington/Dearborn', 12365), ('Clark/Lake', 21640), ('Forest Park', 3636), ('Noyes', 941), ('Cicero-Cermak', 1271), ('Clinton-Forest Park', 4016), ('California-Cermak', 1627), ('95th/Dan Ryan', 11509), ('Merchandise Mart', 8345), ('Racine', 2598), ('Cicero-Lake', 1485), ('Grand/Milwaukee', 2851), ('Garfield-South Elevated', 1413), ('Foster', 963), ('Diversey', 5771), ('Wilson', 6470), ("Irving Park-O'Hare", 4808), ('Jackson/State', 12445), ('California/Milwaukee', 5413), ('54th/Cermak', 2170), ('Damen/Milwaukee', 7022), ('Kostner', 556), ('Ridgeland', 1353), ('Clark/Division', 8216), ('Madison/Wabash', 0), ('North/Clybourn', 6360), ('Armitage', 4575), ('Western/Milwaukee', 5511), ('Adams/Wabash', 9666), ('Dempster', 788), ('Laramie', 1328), ('Chicago/Franklin', 6868), ('East 63rd-Cottage Grove', 1135), ('Washington/Wells', 8267), ('Western-Cermak', 1182), ("Harlem-O'Hare", 3202), ('Granville', 3762), ('Lawrence', 3355), ('Central Park', 1342), ('Monroe/Dearborn', 7972), ('Sedgwick', 4004), ('Medical Center', 3581), ('Rosemont', 6101), ('18th', 2028), ('South Boulevard', 813), ('Library', 4127), ('Francisco', 1617), ('Thorndale', 3355), ("O'Hare Airport", 9742), ('Howard', 5935), ('63rd-Dan Ryan', 3500), ('Pulaski-Forest Park', 2110), ('Midway Airport', 8698), ('Halsted/63rd', 839), ('Pulaski-Orange', 5663), ('Cicero-Forest Park', 1475), ('Harlem-Forest Park', 1185), ('69th', 5790), ('Cermak-Chinatown', 4312), ('Rockwell', 1996), ('Logan Square', 7536), ('Polk', 3750), ('Kedzie-Cermak', 1181), ('Linden', 817), ('Ashland-Orange', 1637), ('Kedzie-Lake', 1753), ('47th-South Elevated', 1347), ('Monroe/State', 11264), ('35-Bronzeville-IIT', 1901), ('Halsted-Orange', 3162), ('King Drive', 651), ('Kedzie-Midway', 3552), ('Clinton-Lake', 4278), ('Garfield-Dan Ryan', 3676), ('Kedzie-Brown', 2039), ('Jarvis', 1817), ('Argyle', 3152), ('Wellington', 3242), ('Fullerton', 15150), ('47th-Dan Ryan', 3331), ("Addison-O'Hare", 3563), ('Central-Evanston', 802), ('Austin-Lake', 1994), ('43rd', 1090), ('Jefferson Park', 7112), ('Kimball', 4236), ('Loyola', 4712), ('Paulina', 2895), ('Belmont-North Main', 12936), ("Montrose-O'Hare", 2529), ('LaSalle', 3556), ('Oak Park-Lake', 1561), ('California-Lake', 1125), ('Bryn Mawr', 4888), ('Roosevelt', 11055), ('Chicago/Milwaukee', 4605), ('Addison-North Main', 6719), ('87th', 4473), ('Addison-Brown', 2754), ('Chicago/State', 13946), ('Irving Park-Brown', 3268), ('Western-Brown', 4273), ('Harrison', 4750), ('Montrose-Brown', 2875), ('Morgan-Lake', 2700), ('Lake/State', 21708), ('Conservatory', 999), ('Oakton-Skokie', 839), ('Cermak-McCormick Place', 1208)]
## [('01/01/2015', 233956), ('01/02/2015', 432144), ('01/03/2015', 273207), ('01/04/2015', 217632), ('01/05/2015', 538868), ('01/06/2015', 556918), ('01/07/2015', 416984), ('01/08/2015', 475074), ('01/09/2015', 524144), ('01/10/2015', 282850), ('01/11/2015', 227240), ('01/12/2015', 605068), ('01/13/2015', 609226), ('01/14/2015', 608109), ('01/15/2015', 622792), ('01/16/2015', 612833), ('01/17/2015', 335555), ('01/18/2015', 244490), ('01/19/2015', 411497), ('01/20/2015', 618377), ('01/21/2015', 619945), ('01/22/2015', 623914), ('01/23/2015', 612177), ('01/24/2015', 333440), ('01/25/2015', 226964), ('01/26/2015', 605287), ('01/27/2015', 626168), ('01/28/2015', 625531), ('01/29/2015', 622695), ('01/30/2015', 618395), ('01/31/2015', 337018)]
## 01/01/2015
## ('01/01/2015', 233956)
## 12/31/2016
## ('12/31/2016', 295002)
## [DateDetails(date='01/01/2015', stop='Austin-Forest Park', riders=587), DateDetails(date='01/01/2015', stop='Harlem-Lake', riders=1106), DateDetails(date='01/01/2015', stop='Pulaski-Lake', riders=811), DateDetails(date='01/01/2015', stop='Quincy/Wells', riders=1117), DateDetails(date='01/01/2015', stop='Davis', riders=1400)]
## 01/01/2015 587 Austin-Forest Park
## 01/01/2015 1106 Harlem-Lake
## 01/01/2015 811 Pulaski-Lake
## 01/01/2015 1117 Quincy/Wells
## 01/01/2015 1400 Davis
## 01/01/2015 2023 Belmont-O'Hare
## 01/01/2015 1730 Jackson/Dearborn
## 01/01/2015 2616 Sheridan
## 01/01/2015 751 Damen-Brown
## 01/01/2015 2433 Morse
## 01/01/2015 862 35th/Archer
## 01/01/2015 430 51st
## 01/01/2015 542 Dempster-Skokie
## 01/01/2015 491 Pulaski-Cermak
## 01/01/2015 270 LaSalle/Van Buren
## 01/01/2015 833 Ashland-Lake
## 01/01/2015 416 Oak Park-Forest Park
## 01/01/2015 1862 Sox-35th-Dan Ryan
## 01/01/2015 2267 Randolph/Wabash
## 01/01/2015 451 Damen-Cermak
Chapter 4 - Handling Dates and Times
DateTime journey - leap years, different length months, time zones, holidays, etc.:
Working with DateTime components and current time:
Adding and subtracting time - the timedelta object:
Libraries to simplify this process:
Example code includes:
myPath = "./PythonInputFiles/"
from collections import defaultdict
dates_list = ['02/19/2001', '04/10/2001', '05/30/2001', '07/19/2001', '09/07/2001', '10/27/2001', '12/16/2001', '02/04/2002', '03/26/2002', '05/15/2002', '07/04/2002', '08/23/2002', '10/12/2002', '12/01/2002', '01/20/2003', '03/11/2003', '04/30/2003', '06/19/2003', '08/08/2003', '09/27/2003', '11/16/2003', '01/05/2004', '02/24/2004', '04/14/2004', '06/03/2004', '07/23/2004', '09/11/2004', '10/31/2004', '12/20/2004', '02/08/2005', '03/30/2005', '05/19/2005', '07/08/2005', '08/27/2005', '10/16/2005', '12/05/2005', '01/24/2006', '03/15/2006', '05/04/2006', '06/23/2006', '08/12/2006', '10/01/2006', '11/20/2006', '01/09/2007', '02/28/2007', '04/19/2007', '06/08/2007', '07/28/2007', '09/16/2007', '11/05/2007', '12/25/2007', '02/13/2008', '04/03/2008', '05/23/2008', '07/12/2008', '08/31/2008', '10/20/2008', '12/09/2008', '01/28/2009', '03/19/2009', '05/08/2009', '06/27/2009', '08/16/2009', '10/05/2009', '11/24/2009', '01/13/2010', '03/04/2010', '04/23/2010', '06/12/2010', '08/01/2010', '09/20/2010', '11/09/2010', '12/29/2010', '02/17/2011', '04/08/2011', '05/28/2011', '07/17/2011', '09/05/2011', '10/24/2011', '11/12/2011', '01/01/2012', '02/20/2012', '04/10/2012', '05/30/2012', '07/19/2012', '09/07/2012', '10/27/2012', '12/16/2012', '02/04/2013', '03/26/2013', '05/15/2013', '07/04/2013', '08/23/2013', '10/12/2013', '12/01/2013', '01/20/2014', '03/11/2014', '04/30/2014', '06/19/2014', '08/08/2014', '09/27/2014', '11/16/2014', '07/05/2014', '01/24/2015', '03/15/2015', '05/04/2015', '06/23/2015', '08/12/2015', '10/01/2015', '11/20/2015', '01/09/2016', '02/28/2016', '04/18/2016', '06/07/2016', '07/27/2016', '09/15/2016', '11/04/2016']
# Import the datetime object from datetime
from datetime import datetime
# Iterate over the dates_list
for date_str in dates_list:
# Convert each date to a datetime object: date_dt
date_dt = datetime.strptime(date_str, "%m/%d/%Y")
# Print each date_dt
print(date_dt)
datetimes_list = [datetime(2001, 2, 19, 0, 0), datetime(2001, 4, 10, 0, 0), datetime(2001, 5, 30, 0, 0), datetime(2001, 7, 19, 0, 0), datetime(2001, 9, 7, 0, 0), datetime(2001, 10, 27, 0, 0), datetime(2001, 12, 16, 0, 0), datetime(2002, 2, 4, 0, 0), datetime(2002, 3, 26, 0, 0), datetime(2002, 5, 15, 0, 0)]
# Loop over datetimes_list
for item in datetimes_list:
# Print out the record as a string in the format of 'MM/DD/YYYY'
print(item.strftime('%m/%d/%Y'))
# Print out the record as an ISO standard string
print(item.isoformat())
# Create stations data from the CSV downloaded from Chicago Open Data
# https://data.cityofchicago.org/Transportation/CTA-Ridership-L-Station-Entries-Daily-Totals/5neh-572f/data
# Filtered the data to download only 2015-2016
import pandas as pd
statRaw = pd.read_csv(myPath + "CTA_Ridership_Station_Entries_Daily_Totals.csv")
statRaw.head()
# mock up daily_summaries as tuple date-rides
x = statRaw.groupby("date")["rides"].sum()
daily_summaries = zip(x.index, x)
# Create a defaultdict of an integer: monthly_total_rides
monthly_total_rides = defaultdict(int)
# Loop over the list daily_summaries
for daily_summary in daily_summaries:
# Convert the service_date to a datetime object
service_datetime = datetime.strptime(daily_summary[0], '%m/%d/%Y')
# Add the total rides to the current amount for the month
monthly_total_rides[service_datetime.month] =+ int(daily_summary[1])
# Print monthly_total_rides
print(monthly_total_rides)
# Import datetime from the datetime module
from datetime import datetime
# Compute the local datetime: local_dt
local_dt = datetime.now()
# Print the local datetime
print(local_dt)
# Compute the UTC datetime: utc_dt
utc_dt = datetime.utcnow()
# Print the UTC datetime
print(utc_dt)
from pytz import timezone
daily_summaries = [(datetime(2001, 1, 1, 10, 27), '126455'), (datetime(2001, 1, 2, 6, 34), '501952'), (datetime(2001, 1, 3, 22, 17), '536432'), (datetime(2001, 1, 4, 15, 20), '550011'), (datetime(2001, 1, 5, 11, 35), '557917'), (datetime(2001, 1, 6, 1, 33), '255356'), (datetime(2001, 1, 7, 5, 58), '169825'), (datetime(2001, 1, 8, 19, 28), '590706'), (datetime(2001, 1, 9, 13, 55), '599905')]
# Create a Timezone object for Chicago
chicago_usa_tz = timezone('US/Central')
# Create a Timezone object for New York
ny_usa_tz = timezone('US/Eastern')
# Iterate over the daily_summaries list
for orig_dt, ridership in daily_summaries:
# Make the orig_dt timezone "aware" for Chicago
chicago_dt = orig_dt.replace(tzinfo=chicago_usa_tz)
# Convert chicago_dt to the New York Timezone
ny_dt = chicago_dt.astimezone(ny_usa_tz)
# Print the chicago_dt, ny_dt, and ridership
print('Chicago: %s, NY: %s, Ridership: %s' % (chicago_dt, ny_dt, ridership))
review_dates = [datetime(2015, 12, 22, 0, 0), datetime(2015, 12, 23, 0, 0), datetime(2015, 12, 24, 0, 0), datetime(2015, 12, 25, 0, 0), datetime(2015, 12, 26, 0, 0), datetime(2015, 12, 27, 0, 0), datetime(2015, 12, 28, 0, 0), datetime(2015, 12, 29, 0, 0), datetime(2015, 12, 30, 0, 0), datetime(2015, 12, 31, 0, 0)]
# Create a daily_summaries that can be used below
statRaw = pd.read_csv(myPath + "CTA_Ridership_Station_Entries_Daily_Totals.csv")
statRaw.head()
# mock up daily_summaries as tuple date-rides
x = statRaw.groupby(["date", "daytype"])["rides"].sum()
daily_summaries = pd.DataFrame( {"day_type":[a[1] for a in x.index], "total_ridership":[a for a in x]} , index=[ datetime.strptime(a[0], '%m/%d/%Y') for a in x.index]).sort_index()
daily_summaries.head()
# Import timedelta from the datetime module
from datetime import timedelta
# Build a timedelta of 30 days: glanceback
glanceback = timedelta(days=30)
# Iterate over the review_dates as date
for date in review_dates:
# Calculate the date 30 days back: prior_period_dt
prior_period_dt = date - glanceback
# Print the review_date, day_type and total_ridership
print('Date: %s, Type: %s, Total Ridership: %s' %
(date,
daily_summaries.loc[date]['day_type'],
daily_summaries.loc[date]['total_ridership']))
# Print the prior_period_dt, day_type and total_ridership
print('Date: %s, Type: %s, Total Ridership: %s' %
(prior_period_dt,
daily_summaries.loc[prior_period_dt]['day_type'],
daily_summaries.loc[prior_period_dt]['total_ridership']))
# Iterate over the date_ranges
# for start_date, end_date in date_ranges:
# Print the End and Start Date
# print(end_date, start_date)
# Print the difference between each end and start date
# print(end_date - start_date)
# Import the pendulum module
import pendulum
# Create a now datetime for Tokyo: tokyo_dt
tokyo_dt = pendulum.now("Asia/Tokyo")
# Covert the tokyo_dt to Los Angeles: la_dt
la_dt = tokyo_dt.in_timezone('America/Los_Angeles')
# Print the ISO 8601 string of la_dt
print(la_dt.to_iso8601_string())
# Iterate over date_ranges
# for start_date, end_date in date_ranges:
# Convert the start_date string to a pendulum date: start_dt
# start_dt = pendulum.parse(start_date)
# Convert the end_date string to a pendulum date: end_dt
# end_dt = pendulum.parse(end_date)
# Print the End and Start Date
# print(end_dt, start_dt)
# Calculate the difference between end_dt and start_dt: diff_period
# diff_period = end_dt - start_dt
# Print the difference in days
# print(diff_period.in_days())
## 2001-02-19 00:00:00
## 2001-04-10 00:00:00
## 2001-05-30 00:00:00
## 2001-07-19 00:00:00
## 2001-09-07 00:00:00
## 2001-10-27 00:00:00
## 2001-12-16 00:00:00
## 2002-02-04 00:00:00
## 2002-03-26 00:00:00
## 2002-05-15 00:00:00
## 2002-07-04 00:00:00
## 2002-08-23 00:00:00
## 2002-10-12 00:00:00
## 2002-12-01 00:00:00
## 2003-01-20 00:00:00
## 2003-03-11 00:00:00
## 2003-04-30 00:00:00
## 2003-06-19 00:00:00
## 2003-08-08 00:00:00
## 2003-09-27 00:00:00
## 2003-11-16 00:00:00
## 2004-01-05 00:00:00
## 2004-02-24 00:00:00
## 2004-04-14 00:00:00
## 2004-06-03 00:00:00
## 2004-07-23 00:00:00
## 2004-09-11 00:00:00
## 2004-10-31 00:00:00
## 2004-12-20 00:00:00
## 2005-02-08 00:00:00
## 2005-03-30 00:00:00
## 2005-05-19 00:00:00
## 2005-07-08 00:00:00
## 2005-08-27 00:00:00
## 2005-10-16 00:00:00
## 2005-12-05 00:00:00
## 2006-01-24 00:00:00
## 2006-03-15 00:00:00
## 2006-05-04 00:00:00
## 2006-06-23 00:00:00
## 2006-08-12 00:00:00
## 2006-10-01 00:00:00
## 2006-11-20 00:00:00
## 2007-01-09 00:00:00
## 2007-02-28 00:00:00
## 2007-04-19 00:00:00
## 2007-06-08 00:00:00
## 2007-07-28 00:00:00
## 2007-09-16 00:00:00
## 2007-11-05 00:00:00
## 2007-12-25 00:00:00
## 2008-02-13 00:00:00
## 2008-04-03 00:00:00
## 2008-05-23 00:00:00
## 2008-07-12 00:00:00
## 2008-08-31 00:00:00
## 2008-10-20 00:00:00
## 2008-12-09 00:00:00
## 2009-01-28 00:00:00
## 2009-03-19 00:00:00
## 2009-05-08 00:00:00
## 2009-06-27 00:00:00
## 2009-08-16 00:00:00
## 2009-10-05 00:00:00
## 2009-11-24 00:00:00
## 2010-01-13 00:00:00
## 2010-03-04 00:00:00
## 2010-04-23 00:00:00
## 2010-06-12 00:00:00
## 2010-08-01 00:00:00
## 2010-09-20 00:00:00
## 2010-11-09 00:00:00
## 2010-12-29 00:00:00
## 2011-02-17 00:00:00
## 2011-04-08 00:00:00
## 2011-05-28 00:00:00
## 2011-07-17 00:00:00
## 2011-09-05 00:00:00
## 2011-10-24 00:00:00
## 2011-11-12 00:00:00
## 2012-01-01 00:00:00
## 2012-02-20 00:00:00
## 2012-04-10 00:00:00
## 2012-05-30 00:00:00
## 2012-07-19 00:00:00
## 2012-09-07 00:00:00
## 2012-10-27 00:00:00
## 2012-12-16 00:00:00
## 2013-02-04 00:00:00
## 2013-03-26 00:00:00
## 2013-05-15 00:00:00
## 2013-07-04 00:00:00
## 2013-08-23 00:00:00
## 2013-10-12 00:00:00
## 2013-12-01 00:00:00
## 2014-01-20 00:00:00
## 2014-03-11 00:00:00
## 2014-04-30 00:00:00
## 2014-06-19 00:00:00
## 2014-08-08 00:00:00
## 2014-09-27 00:00:00
## 2014-11-16 00:00:00
## 2014-07-05 00:00:00
## 2015-01-24 00:00:00
## 2015-03-15 00:00:00
## 2015-05-04 00:00:00
## 2015-06-23 00:00:00
## 2015-08-12 00:00:00
## 2015-10-01 00:00:00
## 2015-11-20 00:00:00
## 2016-01-09 00:00:00
## 2016-02-28 00:00:00
## 2016-04-18 00:00:00
## 2016-06-07 00:00:00
## 2016-07-27 00:00:00
## 2016-09-15 00:00:00
## 2016-11-04 00:00:00
## 02/19/2001
## 2001-02-19T00:00:00
## 04/10/2001
## 2001-04-10T00:00:00
## 05/30/2001
## 2001-05-30T00:00:00
## 07/19/2001
## 2001-07-19T00:00:00
## 09/07/2001
## 2001-09-07T00:00:00
## 10/27/2001
## 2001-10-27T00:00:00
## 12/16/2001
## 2001-12-16T00:00:00
## 02/04/2002
## 2002-02-04T00:00:00
## 03/26/2002
## 2002-03-26T00:00:00
## 05/15/2002
## 2002-05-15T00:00:00
## defaultdict(<class 'int'>, {1: 238267, 2: 609798, 3: 622394, 4: 335950, 5: 619492, 6: 641310, 7: 383347, 8: 640894, 9: 649963, 10: 658584, 11: 631904, 12: 295002})
## 2017-08-16 08:22:47.689722
## 2017-08-16 13:22:47.689722
## Chicago: 2001-01-01 10:27:00-05:51, NY: 2001-01-01 11:18:00-05:00, Ridership: 126455
## Chicago: 2001-01-02 06:34:00-05:51, NY: 2001-01-02 07:25:00-05:00, Ridership: 501952
## Chicago: 2001-01-03 22:17:00-05:51, NY: 2001-01-03 23:08:00-05:00, Ridership: 536432
## Chicago: 2001-01-04 15:20:00-05:51, NY: 2001-01-04 16:11:00-05:00, Ridership: 550011
## Chicago: 2001-01-05 11:35:00-05:51, NY: 2001-01-05 12:26:00-05:00, Ridership: 557917
## Chicago: 2001-01-06 01:33:00-05:51, NY: 2001-01-06 02:24:00-05:00, Ridership: 255356
## Chicago: 2001-01-07 05:58:00-05:51, NY: 2001-01-07 06:49:00-05:00, Ridership: 169825
## Chicago: 2001-01-08 19:28:00-05:51, NY: 2001-01-08 20:19:00-05:00, Ridership: 590706
## Chicago: 2001-01-09 13:55:00-05:51, NY: 2001-01-09 14:46:00-05:00, Ridership: 599905
## Date: 2015-12-22 00:00:00, Type: W, Total Ridership: 547458
## Date: 2015-11-22 00:00:00, Type: U, Total Ridership: 276222
## Date: 2015-12-23 00:00:00, Type: W, Total Ridership: 471055
## Date: 2015-11-23 00:00:00, Type: W, Total Ridership: 642924
## Date: 2015-12-24 00:00:00, Type: W, Total Ridership: 312039
## Date: 2015-11-24 00:00:00, Type: W, Total Ridership: 662887
## Date: 2015-12-25 00:00:00, Type: U, Total Ridership: 133225
## Date: 2015-11-25 00:00:00, Type: W, Total Ridership: 549277
## Date: 2015-12-26 00:00:00, Type: A, Total Ridership: 239119
## Date: 2015-11-26 00:00:00, Type: U, Total Ridership: 191233
## Date: 2015-12-27 00:00:00, Type: U, Total Ridership: 223687
## Date: 2015-11-27 00:00:00, Type: W, Total Ridership: 337460
## Date: 2015-12-28 00:00:00, Type: W, Total Ridership: 399002
## Date: 2015-11-28 00:00:00, Type: A, Total Ridership: 322238
## Date: 2015-12-29 00:00:00, Type: W, Total Ridership: 470650
## Date: 2015-11-29 00:00:00, Type: U, Total Ridership: 255475
## Date: 2015-12-30 00:00:00, Type: W, Total Ridership: 482195
## Date: 2015-11-30 00:00:00, Type: W, Total Ridership: 622425
## Date: 2015-12-31 00:00:00, Type: W, Total Ridership: 466078
## Date: 2015-12-01 00:00:00, Type: W, Total Ridership: 654723
## 2017-08-16T06:22:48-07:00
Chapter 5 - Answering Data Science Questions
Counting within Date Ranges - data set is crime data for Chicago:
Dictionaries with Time Windows for Keys - crimes by district and differences by block:
Final thoughts - learned the fundamentals of data types.
Example code includes:
myPath = "./PythonInputFiles/"
# Downloaded 2015 crime data for districts 001, 016, and 019 from
# https://data.cityofchicago.org/Public-Safety/Crimes-2015/vwwp-7yr9
# File is in myPath + "Chicago_Crime_2015_001_016_019.csv"
# Import the csv module
import csv
# Create the file object: csvfile
csvfile = open(myPath + "Chicago_Crime_2015_001_016_019.csv", "r")
# Create an empty list: crime_data
crime_data = []
# Loop over a csv reader on the file object
for row in csv.reader(csvfile):
# Append the date, type of crime, location description, and arrest
crime_data.append((row[2], row[5], row[7], row[8]))
# crime_data.append((row[0], row[2], row[4], row[5]))
# Remove the first element from crime_data
crime_data.pop(0)
# Print the first 10 records
print(crime_data[:10])
# Import necessary modules
from collections import Counter
from datetime import datetime
# Create a Counter Object: crimes_by_month
crimes_by_month = Counter()
# Loop over the crime_data list
for x in crime_data:
# Convert the first element of each item into a Python Datetime Object: date
date = datetime.strptime(x[0], '%m/%d/%Y %I:%M:%S %p')
# Increment the counter for the month of the row by one
crimes_by_month[date.month] += 1
# Print the 3 most common months for crime
print(crimes_by_month.most_common(3))
# Import necessary modules
from collections import defaultdict
from datetime import datetime
# Create a dictionary that defaults to a list: locations_by_month
locations_by_month = defaultdict(list)
# Loop over the crime_data list
for row in crime_data:
# Convert the first element to a date object
date = datetime.strptime(row[0], '%m/%d/%Y %I:%M:%S %p')
# If the year is 2015 (all I have in this data)
if date.year == 2015:
# Set the dictionary key to the month and add the location (third element) to the values list
locations_by_month[date.month].append(row[2])
# Print the dictionary
# print(locations_by_month) # WAY too long!
# Import Counter from collections
from collections import Counter
# Loop over the items from locations_by_month using tuple expansion of the month and locations
for month, locations in locations_by_month.items():
# Make a Counter of the locations
location_count = Counter(locations)
# Print the month
print(month)
# Print the most common location
print(location_count.most_common(5))
# Create the CSV file: csvfile
csvfile = open(myPath + "Chicago_Crime_2015_001_016_019.csv", "r")
# Create a dictionary that defaults to a list: crimes_by_district
crimes_by_district = defaultdict(list)
# Loop over a DictReader of the CSV file
for row in csv.DictReader(csvfile):
# Pop the district from each row: district
district = row.pop("District")
# Append the rest of the data to the list for proper district in crimes_by_district
crimes_by_district[district].append(row)
# Loop over the crimes_by_district using expansion as district and crimes
for district, crimes in crimes_by_district.items():
# Print the district
print(district)
# Create an empty Counter object: year_count
year_count = Counter()
# Loop over the crimes:
for crime in crimes:
# If there was an arrest
if crime['Arrest'] == 'true':
# Convert the Date to a datetime and get the year
year = datetime.strptime(crime["Date"], '%m/%d/%Y %I:%M:%S %p').year
# Increment the Counter for the year
year_count[year] += 1
# Print the counter
print(year_count)
# Create the crims_by_block as a dictionary list
crimes_by_block = defaultdict(list)
# Loop over a DictReader of the CSV file
csvfile = open(myPath + "Chicago_Crime_2015_001_016_019.csv", "r")
for row in csv.DictReader(csvfile):
block = row.pop("Block")
crimeType = row.pop("Primary Type")
crimes_by_block[block].append(crimeType)
# Create a unique list of crimes for the first block: n_state_st_crimes
n_state_st_crimes = set(crimes_by_block['001XX N STATE ST'])
# Print the list
print(n_state_st_crimes)
# Create a unique list of crimes for the second block: w_terminal_st_crimes
w_terminal_st_crimes = set(crimes_by_block['0000X W TERMINAL ST'])
# Print the list
print(w_terminal_st_crimes)
# Find the differences between the two blocks: crime_differences
print(n_state_st_crimes.difference(w_terminal_st_crimes))
print(w_terminal_st_crimes.difference(n_state_st_crimes))
## [('05/19/2015 01:12:00 AM', 'ASSAULT', 'APARTMENT', 'true'), ('06/24/2015 06:00:00 AM', 'NARCOTICS', 'RESIDENCE', 'true'), ('07/10/2015 06:00:00 AM', 'NARCOTICS', 'GOVERNMENT BUILDING/PROPERTY', 'true'), ('08/21/2015 02:26:00 PM', 'NARCOTICS', 'PARKING LOT/GARAGE(NON.RESID.)', 'true'), ('03/19/2015 08:05:00 PM', 'NARCOTICS', 'AIRPORT/AIRCRAFT', 'true'), ('03/26/2015 09:45:00 AM', 'NARCOTICS', 'AIRPORT/AIRCRAFT', 'true'), ('04/17/2015 10:44:00 AM', 'NARCOTICS', 'SIDEWALK', 'true'), ('09/08/2015 06:00:00 AM', 'NARCOTICS', 'GOVERNMENT BUILDING/PROPERTY', 'true'), ('05/11/2015 06:30:00 PM', 'NARCOTICS', 'AIRPORT/AIRCRAFT', 'true'), ('03/01/2015 09:00:00 AM', 'OTHER OFFENSE', 'OTHER', 'false')]
## [(8, 3187), (7, 3090), (10, 2969)]
## 5
## [('STREET', 470), ('RESIDENCE', 284), ('APARTMENT', 193), ('OTHER', 189), ('SIDEWALK', 184)]
## 6
## [('STREET', 574), ('RESIDENCE', 316), ('SIDEWALK', 276), ('APARTMENT', 209), ('OTHER', 188)]
## 7
## [('STREET', 616), ('RESIDENCE', 313), ('SIDEWALK', 280), ('OTHER', 236), ('APARTMENT', 186)]
## 8
## [('STREET', 618), ('RESIDENCE', 331), ('SIDEWALK', 282), ('APARTMENT', 199), ('OTHER', 186)]
## 3
## [('STREET', 475), ('RESIDENCE', 297), ('APARTMENT', 204), ('OTHER', 177), ('SIDEWALK', 172)]
## 4
## [('STREET', 438), ('RESIDENCE', 345), ('APARTMENT', 198), ('OTHER', 181), ('SIDEWALK', 161)]
## 9
## [('STREET', 514), ('RESIDENCE', 295), ('SIDEWALK', 276), ('OTHER', 210), ('APARTMENT', 187)]
## 11
## [('STREET', 482), ('RESIDENCE', 260), ('APARTMENT', 212), ('OTHER', 200), ('RESTAURANT', 157)]
## 12
## [('STREET', 547), ('RESIDENCE', 364), ('APARTMENT', 232), ('OTHER', 188), ('RESTAURANT', 162)]
## 1
## [('STREET', 416), ('RESIDENCE', 345), ('OTHER', 191), ('APARTMENT', 187), ('RESTAURANT', 125)]
## 2
## [('STREET', 317), ('RESIDENCE', 271), ('APARTMENT', 165), ('OTHER', 153), ('PARKING LOT/GARAGE(NON.RESID.)', 86)]
## 10
## [('STREET', 534), ('RESIDENCE', 300), ('SIDEWALK', 226), ('OTHER', 224), ('APARTMENT', 219)]
## 019
## Counter({2015: 2122})
## 016
## Counter({2015: 1853})
## 001
## Counter({2015: 2788})
## {'BATTERY', 'BURGLARY', 'DECEPTIVE PRACTICE', 'PUBLIC PEACE VIOLATION', 'NARCOTICS', 'ROBBERY', 'CRIMINAL DAMAGE', 'CRIMINAL TRESPASS', 'LIQUOR LAW VIOLATION', 'OTHER OFFENSE', 'ASSAULT', 'THEFT'}
## {'BURGLARY', 'DECEPTIVE PRACTICE', 'PUBLIC PEACE VIOLATION', 'OFFENSE INVOLVING CHILDREN', 'STALKING', 'NARCOTICS', 'MOTOR VEHICLE THEFT', 'ROBBERY', 'CRIMINAL DAMAGE', 'SEX OFFENSE', 'CRIMINAL TRESPASS', 'OTHER OFFENSE', 'WEAPONS VIOLATION', 'BATTERY', 'THEFT', 'ASSAULT', 'NON-CRIMINAL'}
## {'LIQUOR LAW VIOLATION'}
## {'STALKING', 'OFFENSE INVOLVING CHILDREN', 'MOTOR VEHICLE THEFT', 'SEX OFFENSE', 'WEAPONS VIOLATION', 'NON-CRIMINAL'}
Additional Exploration - CTA
Some additional experimentation with the CTA data, including:
Example code includes:
myPath = "./PythonInputFiles/"
# Create stations data from the CSV downloaded from Chicago Open Data
# https://data.cityofchicago.org/Transportation/CTA-Ridership-L-Station-Entries-Daily-Totals/5neh-572f/data
# Filtered the data to download only 2015-2016
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
statRaw = pd.read_csv(myPath + "CTA_Ridership_Station_Entries_Daily_Totals.csv")
statRaw["convDate"] = [datetime.strptime(x, "%m/%d/%Y") for x in statRaw["date"]]
statRaw.head()
# Average daily rides by month
dailyRides = statRaw[["convDate", "rides"]].groupby("convDate").sum()
avgMonthlyRides = dailyRides.resample("M").mean()
print(round(avgMonthlyRides, 0))
avgMonthlyRides.plot()
plt.ylim([0, round(max(avgMonthlyRides["rides"]), -5) + 50000])
plt.title("Average Daily Rides by Month (CTA)")
plt.xlabel("")
plt.ylabel("Average Daily Rides")
# plt.show()
plt.savefig("_dummyPy093.png", bbox_inches="tight")
plt.clf()
# Same axis
convMonthlyRides = avgMonthlyRides.copy()
convMonthlyRides["year"] = convMonthlyRides.index.year
convMonthlyRides["month"] = convMonthlyRides.index.month
convMonthlyRides = convMonthlyRides.pivot_table(index="month", values="rides", columns="year", aggfunc=sum)
convMonthlyRides.plot()
plt.ylim([0, round(max(avgMonthlyRides["rides"]), -5) + 50000])
plt.title("Average Daily Rides by Month (CTA)")
plt.xlabel("Month")
plt.ylabel("Average Daily Rides")
# plt.show()
plt.savefig("_dummyPy094.png", bbox_inches="tight")
plt.clf()
# Average daily rides by daytype
typeRides = statRaw[["daytype", "convDate", "rides"]].groupby(["convDate", "daytype"]).sum()
print(round(typeRides.groupby("daytype").mean(), 0))
typeRides.groupby("daytype").mean().plot(kind="bar")
plt.title("Average Daily Rides by Day Type 2015-2016 (CTA)")
plt.xlabel("Day Type (A=Sat, U=Sun/Hol, W=Weekday)")
plt.ylabel("Average Daily Rides")
# plt.show()
plt.savefig("_dummyPy095.png", bbox_inches="tight")
plt.clf()
# Average daily rides by station
stationRides = statRaw[["stationname", "rides"]].groupby(["stationname"]).mean().sort_values("rides", ascending=False)
print(round(stationRides.iloc[:20, :], 0))
print(round(stationRides.iloc[-20:, :], 0))
stationRides.plot(kind="bar")
plt.title("Average Daily Rides by Station 2015-2016 (CTA)")
plt.xticks([])
plt.ylim([0, round(max(stationRides["rides"]), -4) + 5000])
plt.xlabel("Stations Sorted by Descending Rides")
plt.ylabel("Average Daily Rides")
# plt.show()
plt.savefig("_dummyPy096.png", bbox_inches="tight")
plt.clf()
import numpy as np
# Average daily rides by daytype by station
daytypeRides = statRaw.pivot_table(index="stationname", values="rides", columns="daytype", aggfunc=np.mean)
print(round(daytypeRides.loc[stationRides.iloc[:20, :].index, :], 0))
print(round(daytypeRides.loc[stationRides.iloc[-20:, :].index, :], 0))
# Deviation from average by daytype
daytypeRides["totMean"] = stationRides.loc[daytypeRides.index, "rides"]
ratA = daytypeRides["A"] / daytypeRides["totMean"]
ratU = daytypeRides["U"] / daytypeRides["totMean"]
ratW = daytypeRides["W"] / daytypeRides["totMean"]
print(round(ratA.sort_values(ascending=False)[0:10], 3))
print(round(ratU.sort_values(ascending=False)[0:10], 3))
print(round(ratW.sort_values(ascending=False)[0:10], 3))
print(round(ratA.sort_values(ascending=False)[-10:], 3))
print(round(ratU.sort_values(ascending=False)[-10:], 3))
ratW.sort_values(ascending=False).plot()
ratA.sort_values(ascending=False).plot()
ratU.sort_values(ascending=False).plot()
plt.ylim([0, 1.5])
plt.xticks([])
plt.title("Percentage of Average Daily Rides by Day Type")
plt.xlabel("Station - Sorted Independently for Each Day Type")
plt.ylabel("% of Daily Average Rides on Day Type")
plt.legend(["W (Weekday)", "A (Saturday)", "U (Sun/Hol)"])
# plt.show()
plt.savefig("_dummyPy097.png", bbox_inches="tight")
plt.clf()
# Greatest consistency and inconsistency by station and daytype
statDayType = pd.DataFrame( {"ratW":ratW, "ratA":ratA, "ratU":ratU} )[["ratW", "ratA", "ratU"]]
statDayType["STD"] = statDayType[["ratW", "ratA", "ratU"]].apply(np.std, axis=1)
print(round(statDayType.sort_values("STD", ascending=False).iloc[:20, :], 3))
print(round(statDayType.sort_values("STD", ascending=True).iloc[:20, :], 3))
statDayType.sort_values("STD", ascending=False).plot()
plt.xticks([])
plt.xlabel("Station - Sorted by Decreasing Consistency by Day Type")
plt.legend(["Weekday", "Sat", "Sun/Hol", "Deviation"])
# plt.show()
plt.savefig("_dummyPy098.png", bbox_inches="tight")
plt.clf()
# statAU = statDayType[["ratA", "ratU"]]
# statAU["Delta"] = (statAU["ratA"] - statAU["ratU"]) / (statAU["ratA"] + statAU["ratU"])
# print(round(statAU.sort_values("Delta", ascending=False).iloc[:20, :], 3))
# print(round(statAU.sort_values("Delta", ascending=True).iloc[:20, :], 3))
# statAU.sort_values("Delta", ascending=False).plot()
# plt.xticks([])
# plt.show()
# Greatest seasonality by station
# Use month as a surrogate for season, and compare percent by month to system totals
statMonth = [x.month for x in statRaw["convDate"]]
miniStation = statRaw[["stationname", "rides"]]
miniStation["month"] = statMonth
miniPivot = miniStation.pivot_table(index="stationname", values="rides", columns="month", aggfunc=sum)
miniColSum = miniPivot.apply(sum, axis=0)
miniRowSum = miniPivot.apply(sum, axis=1)
benchPct = miniColSum / sum(miniColSum)
miniPct = miniPivot.copy()
for x in miniPct.columns:
miniPct[x] = miniPct[x] / miniRowSum
miniDev = [sum((miniPct.loc[x, :] - benchPct) ** 2) ** 0.5 for x in miniPct.index]
miniPct["Deviation"] = miniDev
topDev = miniPct.sort_values("Deviation", ascending=False)
del miniPct["Deviation"]
topDev.loc[:, "Deviation"].plot()
plt.xticks([])
plt.title("Station Seasonality vs. System Seasonality (RMSE)")
plt.xlabel("Station")
plt.ylabel("RMSE")
# plt.show()
plt.savefig("_dummyPy099.png", bbox_inches="tight")
plt.clf()
print(topDev.iloc[0:20, :])
benchPct.plot()
plt.ylim([0.025, 0.175])
# Skip the station that closed mid-year (Madison/Wabash) - use index 1, 2, 3, 4, 5 only
for a in topDev.index[1:6]:
miniPct.loc[a, :].plot()
plt.legend(["System Average", topDev.index[1], topDev.index[2], topDev.index[3], topDev.index[4], topDev.index[5]], loc="upper center")
plt.title("Stations with Greatest Seasonality vs. System (RMSE)")
plt.xlabel("Month")
plt.ylabel("% of Annual Rides in Month")
# plt.show()
plt.savefig("_dummyPy100.png", bbox_inches="tight")
plt.clf()
# Patterns by day of week
# Break weekday in to M/Tu/We/Th/F and eliminate weekday holidays
testStation = statRaw.copy()
testStation["weekday"] = [x.weekday() for x in testStation["convDate"]]
testStation["weekday"].value_counts()
testStation.groupby(["daytype", "weekday"]).count()
myBool = (testStation["daytype"] != "U") | (testStation["weekday"] == 6)
useStation = testStation.loc[myBool, :]
print(useStation.groupby(["daytype", "weekday"]).count())
a = useStation[["weekday", "rides", "convDate"]].groupby(["convDate", "weekday"]).sum().groupby("weekday").mean()
print(a)
a.plot(kind="bar")
plt.xlabel("")
plt.ylabel("Average Rides per Day")
plt.title("Average Rides per Day by Day of Week (CTA 2015-2016)")
plt.xticks(np.arange(7), ["Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"], rotation=0)
# plt.show()
plt.savefig("_dummyPy101.png", bbox_inches="tight")
plt.clf()
workDay = useStation.loc[useStation["daytype"] == "W", :].pivot_table(index="stationname", values="rides", columns="weekday", aggfunc=np.mean)
workDay["STD"] = [np.sqrt(sum( (workDay.loc[b, :] / sum(workDay.loc[b, :]) - 0.2) ** 2 )) for b in workDay.index]
workDay.sort_values("STD", ascending=False)["STD"].plot(kind="bar")
plt.xticks([])
plt.xlabel("Stations sorted from Least to Most Consistent")
plt.ylabel("Inconsistency (RMSE)")
plt.title("Consistency by Workday and Station (CTA 2015-2016)")
# plt.show()
plt.savefig("_dummyPy102.png", bbox_inches="tight")
plt.clf()
print(workDay.sort_values("STD", ascending=False).iloc[0:6, :])
(workDay.iloc[:, 0:5].apply(sum, axis=0) / sum(workDay.iloc[:, 0:5].apply(sum, axis=0))).plot()
for c in range(4):
d = workDay.sort_values("STD", ascending=False).iloc[c, 0:5]
(d / sum(d)).plot()
plt.xticks(np.arange(5), ["Mon", "Tues", "Wed", "Thurs", "Fri"])
plt.ylim([0.15, 0.25])
plt.xlabel("")
plt.ylabel("Proportion of Workday Rides")
plt.title("Outlier Stations for Workday Ride Patterns (CTA 2015-2016)")
plt.legend()
# plt.show()
plt.savefig("_dummyPy103.png", bbox_inches="tight")
plt.clf()
## rides
## convDate
## 2015-01-31 474145.0
## 2015-02-28 503489.0
## 2015-03-31 530615.0
## 2015-04-30 546817.0
## 2015-05-31 532022.0
## 2015-06-30 575190.0
## 2015-07-31 579919.0
## 2015-08-31 549998.0
## 2015-09-30 592430.0
## 2015-10-31 601043.0
## 2015-11-30 532817.0
## 2015-12-31 491164.0
## 2016-01-31 478600.0
## 2016-02-29 524561.0
## 2016-03-31 537988.0
## 2016-04-30 538034.0
## 2016-05-31 535405.0
## 2016-06-30 569531.0
## 2016-07-31 542375.0
## 2016-08-31 544725.0
## 2016-09-30 569900.0
## 2016-10-31 576956.0
## 2016-11-30 544092.0
## 2016-12-31 451587.0
## rides
## daytype
## A 381327.0
## U 288387.0
## W 627657.0
## rides
## stationname
## Lake/State 19186.0
## Clark/Lake 16374.0
## Chicago/State 14399.0
## Grand/State 11836.0
## Belmont-North Main 11830.0
## Fullerton 11399.0
## O'Hare Airport 11015.0
## Roosevelt 10427.0
## Washington/Dearborn 10176.0
## 95th/Dan Ryan 9774.0
## Monroe/State 9495.0
## Jackson/State 9147.0
## State/Lake 8813.0
## Addison-North Main 8477.0
## Randolph/Wabash 8185.0
## Midway Airport 7739.0
## Adams/Wabash 7656.0
## Clark/Division 7270.0
## 79th 6664.0
## Jackson/Dearborn 6459.0
## rides
## stationname
## Pulaski-Cermak 1068.0
## Western-Cermak 1057.0
## Harlem-Forest Park 1049.0
## Kedzie-Cermak 1003.0
## California-Lake 968.0
## 51st 947.0
## 43rd 941.0
## Linden 879.0
## Conservatory 865.0
## Dempster 800.0
## Foster 784.0
## Indiana 778.0
## Noyes 732.0
## Central-Evanston 715.0
## South Boulevard 683.0
## Halsted/63rd 628.0
## Oakton-Skokie 583.0
## King Drive 555.0
## Madison/Wabash 540.0
## Kostner 462.0
## daytype A U W
## stationname
## Lake/State 14223.0 10200.0 22252.0
## Clark/Lake 6846.0 5466.0 20817.0
## Chicago/State 13201.0 9736.0 15706.0
## Grand/State 12120.0 9363.0 12340.0
## Belmont-North Main 10877.0 8334.0 12821.0
## Fullerton 9079.0 6611.0 12965.0
## O'Hare Airport 9443.0 10300.0 11501.0
## Roosevelt 9586.0 7664.0 11229.0
## Washington/Dearborn 6398.0 4699.0 12199.0
## 95th/Dan Ryan 7023.0 5529.0 11306.0
## Monroe/State 5298.0 3842.0 11645.0
## Jackson/State 4968.0 3714.0 11243.0
## State/Lake 6201.0 4463.0 10341.0
## Addison-North Main 8889.0 7147.0 8695.0
## Randolph/Wabash 5116.0 3521.0 9877.0
## Midway Airport 4827.0 4195.0 9145.0
## Adams/Wabash 4462.0 3298.0 9305.0
## Clark/Division 6667.0 5185.0 7869.0
## 79th 5298.0 4260.0 7491.0
## Jackson/Dearborn 3387.0 2645.0 7958.0
## daytype A U W
## stationname
## Pulaski-Cermak 804.0 613.0 1226.0
## Western-Cermak 789.0 597.0 1217.0
## Harlem-Forest Park 709.0 505.0 1243.0
## Kedzie-Cermak 780.0 589.0 1144.0
## California-Lake 656.0 522.0 1133.0
## 51st 744.0 542.0 1081.0
## 43rd 649.0 503.0 1100.0
## Linden 749.0 552.0 979.0
## Conservatory 686.0 533.0 977.0
## Dempster 728.0 557.0 870.0
## Foster 592.0 429.0 905.0
## Indiana 513.0 436.0 911.0
## Noyes 537.0 365.0 855.0
## Central-Evanston 650.0 323.0 818.0
## South Boulevard 457.0 327.0 811.0
## Halsted/63rd 433.0 325.0 737.0
## Oakton-Skokie 344.0 238.0 711.0
## King Drive 430.0 344.0 629.0
## Madison/Wabash 415.0 202.0 643.0
## Kostner 309.0 242.0 543.0
## stationname
## Cermak-Chinatown 1.093
## Addison-North Main 1.049
## Grand/State 1.024
## North/Clybourn 0.928
## Belmont-North Main 0.919
## Roosevelt 0.919
## Clark/Division 0.917
## Chicago/State 0.917
## Cermak-McCormick Place 0.910
## Dempster 0.910
## dtype: float64
## stationname
## O'Hare Airport 0.935
## Cermak-Chinatown 0.881
## Addison-North Main 0.843
## Grand/State 0.791
## Roosevelt 0.735
## Clark/Division 0.713
## Pulaski-Forest Park 0.708
## Belmont-North Main 0.704
## Dempster 0.696
## North/Clybourn 0.695
## dtype: float64
## stationname
## LaSalle/Van Buren 1.341
## Washington/Wells 1.338
## Merchandise Mart 1.329
## Quincy/Wells 1.322
## Polk 1.304
## Chicago/Franklin 1.286
## Medical Center 1.275
## Monroe/Dearborn 1.272
## Clinton-Lake 1.272
## Clark/Lake 1.271
## dtype: float64
## stationname
## Medical Center 0.433
## Chicago/Franklin 0.431
## Clinton-Lake 0.431
## Clark/Lake 0.418
## Monroe/Dearborn 0.413
## Polk 0.350
## Merchandise Mart 0.300
## Quincy/Wells 0.282
## Washington/Wells 0.259
## LaSalle/Van Buren 0.249
## dtype: float64
## stationname
## Monroe/Dearborn 0.333
## UIC-Halsted 0.329
## Clinton-Lake 0.317
## Medical Center 0.304
## Chicago/Franklin 0.256
## Polk 0.250
## Quincy/Wells 0.234
## Merchandise Mart 0.188
## Washington/Wells 0.186
## LaSalle/Van Buren 0.180
## dtype: float64
## ratW ratA ratU STD
## stationname
## LaSalle/Van Buren 1.341 0.249 0.180 0.532
## Washington/Wells 1.338 0.259 0.186 0.527
## Merchandise Mart 1.329 0.300 0.188 0.513
## Quincy/Wells 1.322 0.282 0.234 0.502
## Polk 1.304 0.350 0.250 0.475
## Chicago/Franklin 1.286 0.431 0.256 0.450
## Medical Center 1.275 0.433 0.304 0.431
## Clinton-Lake 1.272 0.431 0.317 0.426
## Monroe/Dearborn 1.272 0.413 0.333 0.425
## Clark/Lake 1.271 0.418 0.334 0.423
## UIC-Halsted 1.266 0.449 0.329 0.416
## Ridgeland 1.253 0.509 0.334 0.398
## Oak Park-Forest Park 1.247 0.502 0.367 0.387
## Clinton-Forest Park 1.234 0.478 0.443 0.365
## Jackson/Dearborn 1.232 0.524 0.410 0.364
## Racine 1.230 0.540 0.406 0.361
## Jackson/State 1.229 0.543 0.406 0.360
## Wellington 1.226 0.573 0.391 0.359
## Pulaski-Orange 1.226 0.563 0.400 0.357
## Armitage 1.223 0.600 0.381 0.357
## ratW ratA ratU STD
## stationname
## O'Hare Airport 1.044 0.857 0.935 0.077
## Cermak-Chinatown 1.008 1.093 0.881 0.087
## Addison-North Main 1.026 1.049 0.843 0.092
## Grand/State 1.043 1.024 0.791 0.114
## Roosevelt 1.077 0.919 0.735 0.140
## Clark/Division 1.082 0.917 0.713 0.151
## Belmont-North Main 1.084 0.919 0.704 0.155
## North/Clybourn 1.084 0.928 0.695 0.160
## Dempster 1.088 0.910 0.696 0.160
## Pulaski-Forest Park 1.097 0.851 0.708 0.160
## Laramie 1.094 0.889 0.687 0.166
## Chicago/State 1.091 0.917 0.676 0.170
## Jarvis 1.101 0.869 0.675 0.174
## Cermak-McCormick Place 1.096 0.910 0.660 0.179
## Argyle 1.106 0.849 0.670 0.179
## Sox-35th-Dan Ryan 1.107 0.842 0.671 0.179
## Morse 1.109 0.839 0.668 0.181
## Harrison 1.099 0.898 0.655 0.182
## Lawrence 1.108 0.848 0.660 0.184
## Granville 1.104 0.891 0.642 0.189
## month 1 2 3 4 5 \
## stationname
## Madison/Wabash 0.398988 0.385555 0.214910 0.000000 0.000000
## Oakton-Skokie 0.096158 0.097952 0.111193 0.111125 0.085027
## Dempster-Skokie 0.094221 0.092433 0.106116 0.110728 0.091168
## UIC-Halsted 0.078581 0.093692 0.091518 0.096727 0.056781
## Addison-North Main 0.055724 0.053556 0.064407 0.083002 0.099099
## Cermak-McCormick Place 0.032409 0.062367 0.079548 0.080936 0.088567
## Linden 0.061200 0.058489 0.067087 0.072109 0.091219
## Fullerton 0.085145 0.082689 0.086284 0.092141 0.093940
## California-Cermak 0.073007 0.071432 0.080919 0.078071 0.080125
## Sox-35th-Dan Ryan 0.067474 0.066771 0.076469 0.085913 0.094846
## Racine 0.082383 0.081751 0.095961 0.082130 0.090382
## Laramie 0.076938 0.073433 0.082686 0.098942 0.101857
## Harrison 0.067872 0.078004 0.084941 0.089019 0.085421
## Central-Evanston 0.072770 0.070955 0.076965 0.074477 0.075629
## Grand/State 0.069566 0.064770 0.079480 0.079286 0.086596
## Noyes 0.080514 0.081245 0.081505 0.086206 0.086941
## Jackson/State 0.079003 0.080189 0.086028 0.090937 0.087688
## O'Hare Airport 0.066802 0.061858 0.076246 0.078602 0.090151
## Medical Center 0.079218 0.080000 0.091860 0.085438 0.078145
## Loyola 0.076656 0.081891 0.086326 0.086079 0.075960
##
## month 6 7 8 9 10 \
## stationname
## Madison/Wabash 0.000000 0.000000 0.000000 0.000000 0.000000
## Oakton-Skokie 0.058680 0.057094 0.059329 0.062134 0.065861
## Dempster-Skokie 0.060479 0.062203 0.064026 0.061321 0.065587
## UIC-Halsted 0.055199 0.056054 0.074511 0.115300 0.120902
## Addison-North Main 0.101552 0.109265 0.111560 0.104275 0.100128
## Cermak-McCormick Place 0.090257 0.101195 0.088459 0.096139 0.118213
## Linden 0.100439 0.111700 0.107350 0.093748 0.097619
## Fullerton 0.079906 0.072579 0.069316 0.096725 0.105022
## California-Cermak 0.083142 0.083541 0.083823 0.125757 0.087759
## Sox-35th-Dan Ryan 0.093067 0.100424 0.097194 0.094181 0.087019
## Racine 0.086149 0.069863 0.073575 0.089183 0.095642
## Laramie 0.082131 0.081475 0.080246 0.083775 0.087000
## Harrison 0.077955 0.087137 0.071646 0.094071 0.105191
## Central-Evanston 0.088274 0.088917 0.087022 0.103993 0.099941
## Grand/State 0.091709 0.103065 0.096821 0.086325 0.090692
## Noyes 0.083811 0.085791 0.075373 0.083252 0.100941
## Jackson/State 0.082838 0.083690 0.077902 0.092210 0.098377
## O'Hare Airport 0.089674 0.095615 0.093757 0.091446 0.095852
## Medical Center 0.081168 0.075943 0.077842 0.092896 0.098200
## Loyola 0.078668 0.079758 0.081569 0.097936 0.097516
##
## month 11 12 Deviation
## stationname
## Madison/Wabash 0.000000 0.000547 0.533109
## Oakton-Skokie 0.104118 0.091329 0.085198
## Dempster-Skokie 0.100770 0.090949 0.077561
## UIC-Halsted 0.103581 0.057152 0.076466
## Addison-North Main 0.067443 0.049991 0.061739
## Cermak-McCormick Place 0.091843 0.070068 0.054797
## Linden 0.074121 0.064920 0.047597
## Fullerton 0.082711 0.053544 0.039950
## California-Cermak 0.079249 0.073173 0.039019
## Sox-35th-Dan Ryan 0.073028 0.063615 0.029337
## Racine 0.080634 0.072346 0.028373
## Laramie 0.077736 0.073780 0.027801
## Harrison 0.087984 0.070757 0.025257
## Central-Evanston 0.089313 0.071745 0.023600
## Grand/State 0.078699 0.072990 0.023107
## Noyes 0.090357 0.064064 0.022523
## Jackson/State 0.080978 0.060161 0.022092
## O'Hare Airport 0.084282 0.075715 0.021967
## Medical Center 0.085717 0.073572 0.021767
## Loyola 0.087161 0.070479 0.021431 -c:136: SettingWithCopyWarning:
## A value is trying to be set on a copy of a slice from a DataFrame.
## Try using .loc[row_indexer,col_indexer] = value instead
##
## See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
##
## station_id stationname date rides convDate
## daytype weekday
## A 5 15120 15120 15120 15120 15120
## U 6 14976 14976 14976 14976 14976
## W 0 14112 14112 14112 14112 14112
## 1 14976 14976 14976 14976 14976
## 2 14976 14976 14976 14976 14976
## 3 14688 14688 14688 14688 14688
## 4 14688 14688 14688 14688 14688
## rides
## weekday
## 0 605158.030612
## 1 631270.884615
## 2 630407.423077
## 3 636834.637255
## 4 633605.245098
## 5 381326.885714
## 6 290429.932692
## weekday 0 1 2 3 \
## stationname
## Addison-North Main 7894.020408 8665.096154 8586.673077 8392.549020
## O'Hare Airport 11539.438776 10534.442308 10611.769231 11961.754902
## Grand/State 11465.602041 11898.673077 12045.798077 12363.490196
## Jackson/State 11220.979592 11798.278846 11690.461538 11683.372549
## Madison/Wabash 597.040816 637.326923 632.923077 637.245098
## Cermak-Chinatown 4068.969388 4158.807692 4150.019231 4249.205882
##
## weekday 4 STD
## stationname
## Addison-North Main 9908.529412 0.034280
## O'Hare Airport 12895.421569 0.034244
## Grand/State 13908.558824 0.030359
## Jackson/State 9801.088235 0.029682
## Madison/Wabash 706.705882 0.024785
## Cermak-Chinatown 4721.176471 0.024392
Average Daily Rides by Month - Chicago Train (CTA) 2015-2016:
Average Daily Rides by Month - 2015 vs 2016:
Average Daily Rides by Day Type - (CTA 2015-2016):
Average Daily Rides by Station - (CTA 2015-2016):
Average Daily Rides by Station and Day Type - (CTA 2015-2016):
Consistency of Average Rides by Station and Day Type - (CTA 2015-2016):
Seasonality of Average Rides by Station - (CTA 2015-2016):
Stations with Greatest Seasonality of Average Rides - (CTA 2015-2016):
Average Daily Rides by Day of Week - (CTA 2015-2016):
Consistency by Station of Average Daily Rides by Day of Week - (CTA 2015-2016):
Stations with Greatest Difference from System Average Rides by Day of Week - (CTA 2015-2016):
Additional Exploration - Chicago Crimes
Some additional experimentation with the Chicago Crime data, including:
Example code includes:
myPath = "./PythonInputFiles/"
# Chicago Open Data crime database - filtered for 2015 only and districts 001, 016, and 019
# https://data.cityofchicago.org/Public-Safety/Crimes-2015/vwwp-7yr9
# File is in myPath + "Chicago_Crime_2015_001_016_019.csv"
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
rawCrime = pd.read_csv(myPath + "Chicago_Crime_2015_001_016_019.csv")
filtCrime = rawCrime[["Date", "Block", "Primary Type", "Description", "Location Description", "Arrest", "District", "Beat", "Ward", "Community Area"]]
filtCrime["convDate"] = [datetime.strptime(x.split()[0], "%m/%d/%Y") for x in filtCrime["Date"]]
# Total crime by day and month
dateCrime = filtCrime[["convDate", "Block"]].groupby("convDate").count()
dateCrime.plot()
plt.ylim([0, 10 * round(max(dateCrime["Block"]) / 10, 0) + 10])
plt.xlabel("")
plt.title("Chicago Crimes by Day in 2015 \n(Districts 001, 016, 019)")
# plt.show()
plt.savefig("_dummyPy104.png", bbox_inches="tight")
plt.clf()
dateCrime.resample("M").sum().plot(kind="bar")
plt.xticks(np.arange(12), ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], rotation=0)
plt.xlabel("")
plt.title("Chicago Crimes by Month in 2015 \n(Districts 001, 016, 019)")
# plt.show()
plt.savefig("_dummyPy105.png", bbox_inches="tight")
plt.clf()
# Total crime by type and District
typeCrime = filtCrime.pivot_table(index="Primary Type", columns="District", values="Block", aggfunc=len).fillna(0)
typeCrime["Total"] = typeCrime.apply(sum, axis=1)
print(typeCrime.sort_values("Total", ascending=False).iloc[0:20, :])
# Clearance Rate by Crime Type
arrestCrime = filtCrime[["Primary Type", "Arrest"]].pivot_table(index="Primary Type", columns="Arrest", aggfunc=len).fillna(0)
arrestCrime["Total"] = arrestCrime.apply(sum, axis=1)
arrestCrime["Clear"] = arrestCrime[True] / arrestCrime["Total"]
arrestCrime = arrestCrime.sort_values("Total", ascending=False)
print(arrestCrime.iloc[0:20, :])
nPlot = 12
fig, ax1 = plt.subplots()
(arrestCrime["Total"][0:nPlot]/1000).plot(kind="bar")
plt.title("Chicago Crimes and Clearance Rate in 2015 \n(Districts 001, 016, 019)")
plt.xlabel("Crime Type")
xTickNewLine = [x.capitalize().replace(" ", "\n") for x in arrestCrime.index]
plt.xticks(np.arange(nPlot), xTickNewLine[0:nPlot], fontsize=9, rotation=90)
ax1.set_ylabel("Total Crimes (000)", color="b")
ax1.tick_params("y", colors="b")
ax2 = plt.twinx()
ax2.plot(list(arrestCrime["Clear"][0:nPlot]), "r-")
ax2.set_ylabel("Clearance Rate", color="r")
ax2.tick_params("y", colors="r")
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy106.png", bbox_inches="tight")
plt.clf()
# Chicago Crimes Cleared
nPlot = 12
(arrestCrime.sort_values(True, ascending=False)[True][0:nPlot]/1000).plot(kind="bar")
plt.title("Chicago Crimes Cleared in 2015 \n(Districts 001, 016, 019)")
plt.xlabel("Crime Type")
xTickNewLine = [x.capitalize().replace(" ", "\n") for x in arrestCrime.sort_values(True, ascending=False).index]
plt.xticks(np.arange(nPlot), xTickNewLine[0:nPlot], fontsize=9, rotation=90)
plt.ylabel("Total Crimes Cleared (000)")
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy107.png", bbox_inches="tight")
plt.clf()
# Total crime by location description
locCrime = filtCrime["Location Description"].value_counts()
print(locCrime[0:20])
print(locCrime[0:20].cumsum() / sum(locCrime))
nPlot=15
(locCrime[0:nPlot].cumsum() / sum(locCrime)).plot(kind="bar")
plt.ylim([0, 1])
plt.ylabel("Cumulative percentage of locations")
plt.xlabel("")
plt.title("ECDF for crime locations - Chicago 2015\n(Districts 001, 016, 019)")
xTickNewLine = [x[0:20].capitalize().replace(" ", "\n") for x in locCrime.index]
plt.xticks(np.arange(nPlot), xTickNewLine[0:nPlot], fontsize=8, rotation=90)
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy108.png", bbox_inches="tight")
plt.clf()
# Total crime by location description and district
locDistCrime = filtCrime[["Location Description", "District"]].pivot_table(index="Location Description", columns="District", aggfunc=len).fillna(0)
locDistCrime["Total"] = locDistCrime.apply(sum, axis=1)
locDistCrime = locDistCrime.sort_values("Total", ascending=False)
nPlot = 15
fig, ax1 = plt.subplots()
(locDistCrime["Total"][0:nPlot]).plot(kind="bar", color="b", alpha=0.5)
plt.title("Chicago Crime Locations by District in 2015 \n(Districts 001, 016, 019)")
plt.xlabel("Location Description")
xTickNewLine = [x[0:20].capitalize().replace(" ", "\n") for x in locDistCrime.index]
plt.xticks(np.arange(nPlot), xTickNewLine[0:nPlot], fontsize=8, rotation=90)
ax1.set_ylabel("Total Crimes", color="b")
ax1.tick_params("y", colors="b")
ax2 = plt.twinx()
ax2.plot(list((locDistCrime[1]/locDistCrime["Total"])[0:nPlot]), "r-")
ax2.plot(list((locDistCrime[16]/locDistCrime["Total"])[0:nPlot]), "g-")
ax2.plot(list((locDistCrime[19]/locDistCrime["Total"])[0:nPlot]), "y-")
ax2.set_ylim([0, 1])
ax2.set_ylabel("Proportion by District")
plt.legend(["001", "016", "019"])
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy109.png", bbox_inches="tight")
plt.clf()
## -c:17: SettingWithCopyWarning:
## A value is trying to be set on a copy of a slice from a DataFrame.
## Try using .loc[row_indexer,col_indexer] = value instead
##
## See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
## District 1 16 19 Total
## Primary Type
## THEFT 5651.0 2192.0 4177.0 12020.0
## BATTERY 1313.0 1406.0 1600.0 4319.0
## DECEPTIVE PRACTICE 1431.0 771.0 1069.0 3271.0
## CRIMINAL DAMAGE 732.0 1201.0 1169.0 3102.0
## OTHER OFFENSE 471.0 842.0 435.0 1748.0
## ASSAULT 530.0 528.0 490.0 1548.0
## CRIMINAL TRESPASS 596.0 402.0 398.0 1396.0
## BURGLARY 176.0 597.0 597.0 1370.0
## NARCOTICS 219.0 528.0 404.0 1151.0
## MOTOR VEHICLE THEFT 196.0 344.0 416.0 956.0
## ROBBERY 319.0 159.0 371.0 849.0
## PUBLIC PEACE VIOLATION 99.0 92.0 76.0 267.0
## OFFENSE INVOLVING CHILDREN 47.0 88.0 64.0 199.0
## CRIM SEXUAL ASSAULT 28.0 40.0 86.0 154.0
## SEX OFFENSE 49.0 44.0 49.0 142.0
## WEAPONS VIOLATION 23.0 48.0 33.0 104.0
## INTERFERENCE WITH PUBLIC OFFICER 14.0 20.0 26.0 60.0
## LIQUOR LAW VIOLATION 12.0 12.0 26.0 50.0
## PROSTITUTION 24.0 7.0 3.0 34.0
## ARSON 7.0 8.0 19.0 34.0
## Arrest False True Total Clear
## Primary Type
## THEFT 10424.0 1596.0 12020.0 0.132779
## BATTERY 3119.0 1200.0 4319.0 0.277842
## DECEPTIVE PRACTICE 3070.0 201.0 3271.0 0.061449
## CRIMINAL DAMAGE 2909.0 193.0 3102.0 0.062218
## OTHER OFFENSE 1551.0 197.0 1748.0 0.112700
## ASSAULT 1152.0 396.0 1548.0 0.255814
## CRIMINAL TRESPASS 339.0 1057.0 1396.0 0.757163
## BURGLARY 1281.0 89.0 1370.0 0.064964
## NARCOTICS 2.0 1149.0 1151.0 0.998262
## MOTOR VEHICLE THEFT 892.0 64.0 956.0 0.066946
## ROBBERY 737.0 112.0 849.0 0.131920
## PUBLIC PEACE VIOLATION 115.0 152.0 267.0 0.569288
## OFFENSE INVOLVING CHILDREN 171.0 28.0 199.0 0.140704
## CRIM SEXUAL ASSAULT 142.0 12.0 154.0 0.077922
## SEX OFFENSE 99.0 43.0 142.0 0.302817
## WEAPONS VIOLATION 22.0 82.0 104.0 0.788462
## INTERFERENCE WITH PUBLIC OFFICER 3.0 57.0 60.0 0.950000
## LIQUOR LAW VIOLATION 0.0 50.0 50.0 1.000000
## PROSTITUTION 0.0 34.0 34.0 1.000000
## ARSON 25.0 9.0 34.0 0.264706
## STREET 6001
## RESIDENCE 3721
## APARTMENT 2391
## SIDEWALK 2336
## OTHER 2323
## RESTAURANT 1624
## PARKING LOT/GARAGE(NON.RESID.) 1220
## DEPARTMENT STORE 1200
## SMALL RETAIL STORE 1158
## RESIDENCE-GARAGE 720
## GROCERY FOOD STORE 570
## RESIDENCE PORCH/HALLWAY 531
## PARK PROPERTY 525
## HOTEL/MOTEL 482
## ALLEY 475
## BAR OR TAVERN 474
## RESIDENTIAL YARD (FRONT/BACK) 403
## VEHICLE NON-COMMERCIAL 368
## COMMERCIAL / BUSINESS OFFICE 368
## SCHOOL, PUBLIC, BUILDING 352
## Name: Location Description, dtype: int64
## STREET 0.182829
## RESIDENCE 0.296195
## APARTMENT 0.369040
## SIDEWALK 0.440210
## OTHER 0.510983
## RESTAURANT 0.560461
## PARKING LOT/GARAGE(NON.RESID.) 0.597630
## DEPARTMENT STORE 0.634189
## SMALL RETAIL STORE 0.669470
## RESIDENCE-GARAGE 0.691405
## GROCERY FOOD STORE 0.708771
## RESIDENCE PORCH/HALLWAY 0.724949
## PARK PROPERTY 0.740944
## HOTEL/MOTEL 0.755629
## ALLEY 0.770100
## BAR OR TAVERN 0.784541
## RESIDENTIAL YARD (FRONT/BACK) 0.796819
## VEHICLE NON-COMMERCIAL 0.808031
## COMMERCIAL / BUSINESS OFFICE 0.819243
## SCHOOL, PUBLIC, BUILDING 0.829967
## Name: Location Description, dtype: float64
Crimes by Day (Chicago 2015 - Districts 001, 016, 019):
Crimes by Month (Chicago 2015 - Districts 001, 016, 019):
% Crimes Cleared by Crime Type (Chicago 2015 - Districts 001, 016, 019):
Total # Crimes Cleared by Crime Type (Chicago 2015 - Districts 001, 016, 019):
ECDF for Location Descriptions (Chicago 2015 - Districts 001, 016, 019):
Location Descriptions by District (Chicago 2015 - Districts 001, 016, 019):
Chapter 1 - Data Ingestion and Inspection
Plotting multiple graphs - suppose that you have measurements time, Temperature, and DewPoint:
Customizing axes - making the plots less messy and more appealing:
Legends, annotations, and styles:
Example code includes:
year = [1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011]
physical_sciences = [13.800000000000001, 14.9, 14.800000000000001, 16.5, 18.199999999999999, 19.100000000000001, 20.0, 21.300000000000001, 22.5, 23.699999999999999, 24.600000000000001, 25.699999999999999, 27.300000000000001, 27.600000000000001, 28.0, 27.5, 28.399999999999999, 30.399999999999999, 29.699999999999999, 31.300000000000001, 31.600000000000001, 32.600000000000001, 32.600000000000001, 33.600000000000001, 34.799999999999997, 35.899999999999999, 37.299999999999997, 38.299999999999997, 39.700000000000003, 40.200000000000003, 41.0, 42.200000000000003, 41.100000000000001, 41.700000000000003, 42.100000000000001, 41.600000000000001, 40.799999999999997, 40.700000000000003, 40.700000000000003, 40.700000000000003, 40.200000000000003, 40.100000000000001]
computer_science = [13.6, 13.6, 14.9, 16.399999999999999, 18.899999999999999, 19.800000000000001, 23.899999999999999, 25.699999999999999, 28.100000000000001, 30.199999999999999, 32.5, 34.799999999999997, 36.299999999999997, 37.100000000000001, 36.799999999999997, 35.700000000000003, 34.700000000000003, 32.399999999999999, 30.800000000000001, 29.899999999999999, 29.399999999999999, 28.699999999999999, 28.199999999999999, 28.5, 28.5, 27.5, 27.100000000000001, 26.800000000000001, 27.0, 28.100000000000001, 27.699999999999999, 27.600000000000001, 27.0, 25.100000000000001, 22.199999999999999, 20.600000000000001, 18.600000000000001, 17.600000000000001, 17.800000000000001, 18.100000000000001, 17.600000000000001, 18.199999999999999]
# Import matplotlib.pyplot
import matplotlib.pyplot as plt
# Plot in blue the % of degrees awarded to women in the Physical Sciences
plt.plot(year, physical_sciences, color='blue')
# Plot in red the % of degrees awarded to women in Computer Science
plt.plot(year, computer_science, color='red')
# Display the plot
# plt.show()
plt.savefig("_dummyPy110.png", bbox_inches="tight")
plt.clf()
# Create plot axes for the first line plot
plt.axes([0.05, 0.05, 0.425, 0.9])
# Plot in blue the % of degrees awarded to women in the Physical Sciences
plt.plot(year, physical_sciences, color='blue')
# Create plot axes for the second line plot
plt.axes([0.525, 0.05, 0.425, 0.9])
# Plot in red the % of degrees awarded to women in Computer Science
plt.plot(year, computer_science, color='red')
# Display the plot
# plt.show()
plt.savefig("_dummyPy111.png", bbox_inches="tight")
plt.clf()
# Create a figure with 1x2 subplot and make the left subplot active
plt.subplot(1, 2, 1)
# Plot in blue the % of degrees awarded to women in the Physical Sciences
plt.plot(year, physical_sciences, color='blue')
plt.title('Physical Sciences')
# Make the right subplot active in the current 1x2 subplot grid
plt.subplot(1, 2, 2)
# Plot in red the % of degrees awarded to women in Computer Science
plt.plot(year, computer_science, color='red')
plt.title('Computer Science')
# Use plt.tight_layout() to improve the spacing between subplots
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy112.png", bbox_inches="tight")
plt.clf()
health = [77.099999999999994, 75.5, 76.900000000000006, 77.400000000000006, 77.900000000000006, 78.900000000000006, 79.200000000000003, 80.5, 81.900000000000006, 82.299999999999997, 83.5, 84.099999999999994, 84.400000000000006, 84.599999999999994, 85.099999999999994, 85.299999999999997, 85.700000000000003, 85.5, 85.200000000000003, 84.599999999999994, 83.900000000000006, 83.5, 83.0, 82.400000000000006, 81.799999999999997, 81.5, 81.299999999999997, 81.900000000000006, 82.099999999999994, 83.5, 83.5, 85.099999999999994, 85.799999999999997, 86.5, 86.5, 86.0, 85.900000000000006, 85.400000000000006, 85.200000000000003, 85.099999999999994, 85.0, 84.799999999999997]
education = [74.535327580000001, 74.149203689999993, 73.554519959999993, 73.501814429999996, 73.336811429999997, 72.801854480000003, 72.166524710000004, 72.456394810000006, 73.192821339999995, 73.821142339999994, 74.981031520000002, 75.845123450000003, 75.843649139999997, 75.950601230000004, 75.869116009999999, 75.923439709999997, 76.143015160000004, 76.963091680000005, 77.627661770000003, 78.111918720000006, 78.866858590000007, 78.991245969999994, 78.435181909999997, 77.267311989999996, 75.814932639999995, 75.125256210000003, 75.035199210000002, 75.163701299999985, 75.486160269999999, 75.838162060000002, 76.692142840000002, 77.375229309999995, 78.644243939999996, 78.544948149999996, 78.65074774, 79.067121729999997, 78.686305509999997, 78.72141311, 79.196326740000003, 79.532908700000007, 79.618624510000004, 79.432811839999999]
# Create a figure with 2x2 subplot layout and make the top left subplot active
plt.subplot(2, 2, 1)
# Plot in blue the % of degrees awarded to women in the Physical Sciences
plt.plot(year, physical_sciences, color='blue')
plt.title('Physical Sciences')
# Make the top right subplot active in the current 2x2 subplot grid
plt.subplot(2, 2, 2)
# Plot in red the % of degrees awarded to women in Computer Science
plt.plot(year, computer_science, color='red')
plt.title('Computer Science')
# Make the bottom left subplot active in the current 2x2 subplot grid
plt.subplot(2, 2, 3)
# Plot in green the % of degrees awarded to women in Health Professions
plt.plot(year, health, color='green')
plt.title('Health Professions')
# Make the bottom right subplot active in the current 2x2 subplot grid
plt.subplot(2, 2, 4)
# Plot in yellow the % of degrees awarded to women in Education
plt.plot(year, education, color='yellow')
plt.title('Education')
# Improve the spacing between subplots and display them
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy113.png", bbox_inches="tight")
plt.clf()
# Plot the % of degrees awarded to women in Computer Science and the Physical Sciences
plt.plot(year,computer_science, color='red')
plt.plot(year, physical_sciences, color='blue')
# Add the axis labels
plt.xlabel('Year')
plt.ylabel('Degrees awarded to women (%)')
# Set the x-axis range
plt.xlim(1990, 2010)
# Set the y-axis range
plt.ylim(0, 50)
# Add a title and display the plot
plt.title('Degrees awarded to women (1990-2010)\nComputer Science (red)\nPhysical Sciences (blue)')
# plt.show()
plt.savefig("_dummyPy114.png", bbox_inches="tight")
plt.clf()
# Save the image as 'xlim_and_ylim.png'
# plt.savefig("xlim_and_ylim.png")
# Plot in blue the % of degrees awarded to women in Computer Science
plt.plot(year,computer_science, color='blue')
# Plot in red the % of degrees awarded to women in the Physical Sciences
plt.plot(year, physical_sciences,color='red')
# Set the x-axis and y-axis limits
plt.axis([1990, 2010, 0, 50])
# Show the figure
# plt.show()
plt.savefig("_dummyPy115.png", bbox_inches="tight")
plt.clf()
# Save the figure as 'axis_limits.png'
# plt.savefig("axis_limits.png")
# Specify the label 'Computer Science'
plt.plot(year, computer_science, color='red', label='Computer Science')
# Specify the label 'Physical Sciences'
plt.plot(year, physical_sciences, color='blue', label='Physical Sciences')
# Add a legend at the lower center
plt.legend(loc="lower center")
# Add axis labels and title
plt.xlabel('Year')
plt.ylabel('Enrollment (%)')
plt.title('Undergraduate enrollment of women')
# plt.show()
plt.savefig("_dummyPy116.png", bbox_inches="tight")
plt.clf()
# Plot with legend as before
plt.plot(year, computer_science, color='red', label='Computer Science')
plt.plot(year, physical_sciences, color='blue', label='Physical Sciences')
plt.legend(loc='bottom right')
# Compute the maximum enrollment of women in Computer Science: cs_max
# cs_max = computer_science.max()
cs_max = max(computer_science)
# Calculate the year in which there was maximum enrollment of women in Computer Science: yr_max
#yr_max = year[computer_science.argmax()]
yr_max = year[computer_science.index(cs_max)]
# Add a black arrow annotation
plt.annotate("Maximum", xy=(yr_max, cs_max), xytext=(yr_max + 5, cs_max + 5), arrowprops={"facecolor":'black'})
# Add axis labels and title
plt.xlabel('Year')
plt.ylabel('Enrollment (%)')
plt.title('Undergraduate enrollment of women')
# plt.show()
plt.savefig("_dummyPy117.png", bbox_inches="tight")
plt.clf()
# Import matplotlib.pyplot
import matplotlib.pyplot as plt
# Set the style to 'ggplot'
plt.style.use("ggplot")
# Create a figure with 2x2 subplot layout
plt.subplot(2, 2, 1)
# Plot the enrollment % of women in the Physical Sciences
plt.plot(year, physical_sciences, color='blue')
plt.title('Physical Sciences')
# Plot the enrollment % of women in Computer Science
plt.subplot(2, 2, 2)
plt.plot(year, computer_science, color='red')
plt.title('Computer Science')
# Add annotation
cs_max = max(computer_science)
yr_max = year[computer_science.index(cs_max)]
plt.annotate('Maximum', xy=(yr_max, cs_max), xytext=(yr_max-1, cs_max-10), arrowprops=dict(facecolor='black'))
# Plot the enrollmment % of women in Health professions
plt.subplot(2, 2, 3)
plt.plot(year, health, color='green')
plt.title('Health Professions')
# Plot the enrollment % of women in Education
plt.subplot(2, 2, 4)
plt.plot(year, education, color='yellow')
plt.title('Education')
# Improve spacing between subplots and display them
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy118.png", bbox_inches="tight")
plt.clf()
## C:\Users\Dave\AppData\Local\Programs\Python\PYTHON~1\lib\site-packages\matplotlib\legend.py:326: UserWarning: Unrecognized location "bottom right". Falling back on "best"; valid locations are
## best
## upper right
## upper left
## lower left
## lower right
## right
## center left
## center right
## lower center
## upper center
## center
##
## six.iterkeys(self.codes))))
Example #1: Unlabelled Plot on Single Set of Axes:
Example #2: Subplots on Separate Axes:
Example #3: Subplots on Separate Axes with Titles:
Example #4: Subplots on Separate Axes with Titles:
Example #5: Title and Axis Labels for Two Plots on a Single Set of Axes:
Example #6: Title, Axis Labels, and Legend for Two Plots on a Single Set of Axes:
Example #7: Annotation with Arrow:
Example #8: Subplots in ggplot2 Format with One Subplot Annotated:
Chapter 2 - Plotting 2D Arrays (Raster Data or Bivariate Function Data)
Working with 2D Arrays - reminders about NumPy arrays:
Visualizing bivariate functions - including the “pseudo-color” (plt.pcolor()) calls:
Visualizing bivariate distributions - distributions of 2D points:
Working with images (matrices of intensity values):
Example code includes:
myPath = "./PythonInputFiles/"
# Import numpy and matplotlib.pyplot
import numpy as np
import matplotlib.pyplot as plt
# Generate two 1-D arrays: u, v
u = np.linspace(-2, 2, 41)
v = np.linspace(-1, 1, 21)
# Generate 2-D arrays from u and v: X, Y
X,Y = np.meshgrid(u, v)
# Compute Z based on X and Y
Z = np.sin(3*np.sqrt(X**2 + Y**2))
# Display the resulting image with pcolor()
plt.pcolor(Z)
# plt.show()
plt.savefig("_dummyPy119.png", bbox_inches="tight")
plt.clf()
# Save the figure to 'sine_mesh.png'
# plt.savefig("sine_mesh.png")
u = np.linspace(-2, 2, 101)
v = np.linspace(0, 2, 51)
X,Y = np.meshgrid(u, v)
Z = X**2/8 + Y**2/8
plt.set_cmap("viridis") # bring back to what it looks like DataCamp may be using
# Generate a default contour map of the array Z
plt.subplot(2,2,1)
plt.contour(X, Y, Z)
# Generate a contour map with 20 contours
plt.subplot(2,2,2)
plt.contour(X, Y, Z, 20)
# Generate a default filled contour map of the array Z
plt.subplot(2,2,3)
plt.contourf(X, Y, Z)
# Generate a default filled contour map with 20 contours
plt.subplot(2,2,4)
plt.contourf(X, Y, Z, 20)
# Improve the spacing between subplots
plt.tight_layout()
# Display the figure
# plt.show()
plt.savefig("_dummyPy120.png", bbox_inches="tight")
plt.clf()
# Create a filled contour plot with a color map of 'viridis'
plt.subplot(2,2,1)
plt.contourf(X,Y,Z,20, cmap='viridis')
plt.colorbar()
plt.title('Viridis')
# Create a filled contour plot with a color map of 'gray'
plt.subplot(2,2,2)
plt.contourf(X,Y,Z,20, cmap='gray')
plt.colorbar()
plt.title('Gray')
# Create a filled contour plot with a color map of 'autumn'
plt.subplot(2,2,3)
plt.contourf(X,Y,Z,20, cmap='autumn')
plt.colorbar()
plt.title('Autumn')
# Create a filled contour plot with a color map of 'winter'
plt.subplot(2,2,4)
plt.contourf(X,Y,Z,20, cmap='winter')
plt.colorbar()
plt.title('Winter')
# Improve the spacing between subplots and display them
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy121.png", bbox_inches="tight")
plt.clf()
mpg = [18.0, 9.0, 36.100000000000001, 18.5, 34.299999999999997, 32.899999999999999, 32.200000000000003, 22.0, 15.0, 17.0, 44.0, 24.5, 32.0, 14.0, 15.0, 13.0, 36.0, 31.0, 32.0, 21.5, 19.0, 17.0, 16.0, 15.0, 23.0, 26.0, 32.0, 24.0, 21.0, 31.300000000000001, 32.700000000000003, 15.0, 23.0, 17.600000000000001, 28.0, 24.0, 14.0, 18.100000000000001, 36.0, 29.0, 35.100000000000001, 36.0, 16.5, 16.0, 29.899999999999999, 31.0, 27.199999999999999, 14.0, 32.100000000000001, 15.0, 12.0, 17.600000000000001, 25.0, 28.399999999999999, 29.0, 30.899999999999999, 20.0, 20.800000000000001, 22.0, 38.0, 31.0, 19.0, 16.0, 25.0, 22.0, 26.0, 13.0, 19.899999999999999, 11.0, 28.0, 15.5, 26.0, 14.0, 12.0, 24.199999999999999, 25.0, 22.5, 26.800000000000001, 23.0, 26.0, 30.699999999999999, 31.0, 27.199999999999999, 21.5, 29.0, 20.0, 13.0, 14.0, 38.0, 13.0, 24.5, 13.0, 25.0, 24.0, 34.100000000000001, 13.0, 44.600000000000001, 20.5, 18.0, 23.199999999999999, 20.0, 24.0, 25.5, 36.100000000000001, 23.0, 24.0, 18.0, 26.600000000000001, 32.0, 20.300000000000001, 27.0, 17.0, 21.0, 13.0, 24.0, 17.0, 39.100000000000001, 14.5, 13.0, 20.199999999999999, 27.0, 35.0, 15.0, 36.399999999999999, 30.0, 31.899999999999999, 26.0, 16.0, 20.0, 18.600000000000001, 14.0, 25.0, 33.0, 14.0, 18.5, 37.200000000000003, 18.0, 44.299999999999997, 18.0, 28.0, 43.399999999999999, 20.600000000000001, 19.199999999999999, 26.399999999999999, 18.0, 28.0, 26.0, 13.0, 25.800000000000001, 28.100000000000001, 13.0, 16.5, 31.5, 24.0, 15.0, 18.0, 33.5, 32.399999999999999, 27.0, 13.0, 31.0, 28.0, 27.199999999999999, 21.0, 19.0, 25.0, 23.0, 19.0, 15.5, 23.899999999999999, 22.0, 29.0, 14.0, 15.0, 27.0, 15.0, 30.5, 25.0, 17.5, 34.0, 38.0, 30.0, 19.800000000000001, 25.0, 21.0, 26.0, 16.5, 18.100000000000001, 46.600000000000001, 21.5, 14.0, 21.600000000000001, 15.5, 20.5, 23.899999999999999, 12.0, 20.199999999999999, 34.399999999999999, 23.0, 24.300000000000001, 19.0, 29.0, 23.5, 34.0, 37.0, 33.0, 18.0, 15.0, 34.700000000000003, 19.399999999999999, 32.0, 34.100000000000001, 33.700000000000003, 20.0, 15.0, 38.100000000000001, 26.0, 27.0, 16.0, 17.0, 13.0, 28.0, 14.0, 31.5, 34.5, 11.0, 16.0, 31.600000000000001, 19.100000000000001, 18.5, 15.0, 18.0, 35.0, 20.199999999999999, 13.0, 31.0, 22.0, 11.0, 33.5, 43.100000000000001, 25.399999999999999, 40.799999999999997, 14.0, 29.800000000000001, 16.0, 20.600000000000001, 18.0, 33.0, 31.800000000000001, 13.0, 20.0, 32.0, 13.0, 23.699999999999999, 19.199999999999999, 37.0, 18.0, 19.0, 32.299999999999997, 18.0, 13.0, 12.0, 36.0, 18.199999999999999, 19.0, 30.0, 15.0, 11.0, 10.0, 16.0, 14.0, 16.899999999999999, 13.0, 25.0, 21.0, 21.100000000000001, 26.0, 28.0, 29.0, 16.0, 26.600000000000001, 19.0, 32.799999999999997, 22.0, 19.0, 31.0, 23.0, 29.5, 17.5, 19.0, 24.0, 14.0, 28.0, 21.0, 22.399999999999999, 36.0, 18.0, 16.199999999999999, 39.399999999999999, 30.0, 18.0, 17.5, 28.800000000000001, 22.0, 34.200000000000003, 30.5, 16.0, 38.0, 41.5, 27.899999999999999, 22.0, 29.800000000000001, 17.699999999999999, 15.0, 14.0, 15.5, 17.5, 12.0, 29.0, 15.5, 35.700000000000003, 26.0, 30.0, 33.799999999999997, 18.0, 13.0, 20.0, 32.399999999999999, 16.0, 27.5, 23.0, 14.0, 17.0, 16.0, 23.0, 24.0, 27.0, 15.0, 27.0, 28.0, 14.0, 33.5, 39.0, 24.0, 26.5, 19.399999999999999, 15.0, 25.5, 14.0, 27.399999999999999, 13.0, 19.0, 17.0, 28.0, 22.0, 30.0, 18.0, 14.0, 22.0, 23.800000000000001, 24.0, 26.0, 26.0, 30.0, 29.0, 14.0, 25.399999999999999, 19.0, 12.0, 20.0, 27.0, 22.300000000000001, 10.0, 19.199999999999999, 26.0, 16.0, 37.299999999999997, 26.0, 20.199999999999999, 13.0, 21.0, 25.0, 20.5, 37.700000000000003, 36.0, 20.0, 37.0, 18.0, 27.0, 29.5, 17.5, 25.100000000000001]
hp = [88, 193, 60, 98, 78, 100, 75, 76, 130, 140, 52, 88, 84, 148, 150, 130, 58, 82, 65, 110, 95, 110, 140, 170, 78, 90, 96, 95, 110, 75, 132, 150, 83, 85, 86, 75, 140, 139, 70, 52, 60, 84, 138, 180, 65, 67, 97, 150, 70, 100, 180, 129, 95, 90, 83, 75, 100, 85, 112, 67, 65, 88, 100, 75, 100, 70, 145, 110, 210, 80, 145, 69, 150, 198, 120, 92, 90, 115, 95, 75, 76, 67, 71, 115, 84, 91, 150, 215, 67, 175, 60, 175, 110, 95, 68, 150, 67, 95, 110, 105, 102, 110, 89, 66, 88, 75, 78, 105, 70, 103, 60, 150, 72, 170, 90, 110, 58, 152, 145, 139, 83, 69, 150, 67, 80, 71, 46, 105, 90, 110, 175, 80, 74, 150, 150, 65, 100, 48, 105, 90, 48, 105, 105, 88, 100, 75, 113, 190, 92, 80, 165, 180, 71, 97, 72, 105, 90, 75, 88, 155, 68, 90, 84, 87, 112, 87, 125, 108, 142, 97, 105, 75, 137, 150, 88, 145, 63, 95, 140, 88, 85, 70, 85, 115, 86, 79, 120, 120, 65, 110, 220, 115, 170, 100, 90, 225, 85, 65, 97, 90, 90, 49, 110, 70, 92, 53, 100, 190, 63, 90, 67, 65, 75, 100, 110, 60, 93, 88, 150, 100, 150, 88, 225, 68, 70, 208, 105, 74, 90, 110, 72, 97, 88, 88, 129, 85, 86, 150, 70, 48, 77, 65, 175, 90, 150, 110, 130, 53, 65, 158, 95, 61, 215, 100, 145, 68, 150, 88, 67, 105, 175, 160, 74, 135, 100, 67, 198, 180, 215, 100, 225, 155, 170, 81, 85, 95, 80, 92, 70, 149, 84, 97, 52, 72, 85, 52, 95, 71, 140, 100, 96, 150, 75, 107, 110, 75, 97, 133, 70, 67, 112, 145, 115, 98, 70, 78, 230, 63, 76, 105, 95, 62, 165, 165, 160, 190, 95, 180, 78, 120, 80, 75, 68, 67, 95, 140, 110, 72, 150, 95, 54, 153, 130, 170, 86, 97, 90, 145, 86, 79, 165, 83, 64, 92, 72, 140, 150, 96, 150, 80, 130, 100, 125, 90, 94, 76, 90, 150, 97, 85, 81, 78, 46, 84, 70, 153, 116, 100, 167, 88, 88, 88, 200, 125, 92, 110, 69, 67, 90, 150, 90, 71, 105, 62, 88, 122, 65, 88, 90, 68, 110, 88]
# Generate a 2-D histogram
plt.hist2d(hp, mpg, bins=(20, 20), range=((40, 235), (8, 48)))
# Add a color bar to the histogram
plt.colorbar()
# Add labels, title, and display the plot
plt.xlabel('Horse power [hp]')
plt.ylabel('Miles per gallon [mpg]')
plt.title('hist2d() plot')
# plt.show()
plt.savefig("_dummyPy122.png", bbox_inches="tight")
plt.clf()
# Generate a 2d histogram with hexagonal bins
plt.hexbin(hp, mpg, gridsize=(15, 12), extent=(40, 235, 8, 48))
# Add a color bar to the histogram
plt.colorbar()
# Add labels, title, and display the plot
plt.xlabel('Horse power [hp]')
plt.ylabel('Miles per gallon [mpg]')
plt.title('hexbin() plot')
# plt.show()
plt.savefig("_dummyPy123.png", bbox_inches="tight")
plt.clf()
# Load the image into an array: img
# Downloaded Astrounaut-EVA.jpg from https://en.wikipedia.org/wiki/File:Astronaut-EVA.jpg
# img = plt.imread('480px-Astronaut-EVA.jpg')
# Cannot be read on my computer using regular Python but OK with Anaconda . . .
img = plt.imread(myPath + 'Astronaut-EVA.jpg')
# Print the shape of the image
print(img.shape)
# Display the image
plt.imshow(img)
# Hide the axes
plt.axis("off")
# plt.show()
plt.savefig("_dummyPy124.png", bbox_inches="tight")
plt.clf()
# Compute the sum of the red, green and blue channels: intensity
intensity = img.sum(axis=2)
# Print the shape of the intensity
print(intensity.shape)
# Display the intensity with a colormap of 'gray'
plt.imshow(intensity, cmap="gray")
# Add a colorbar
plt.colorbar()
# Hide the axes and show the figure
plt.axis('off')
# plt.show()
plt.savefig("_dummyPy125.png", bbox_inches="tight")
plt.clf()
# Specify the extent and aspect ratio of the top left subplot
plt.subplot(2,2,1)
plt.title('extent=(-1,1,-1,1),\naspect=0.5')
plt.xticks([-1,0,1])
plt.yticks([-1,0,1])
plt.imshow(img, extent=(-1,1,-1,1), aspect=0.5)
# Specify the extent and aspect ratio of the top right subplot
plt.subplot(2,2,2)
plt.title('extent=(-1,1,-1,1),\naspect=1')
plt.xticks([-1,0,1])
plt.yticks([-1,0,1])
plt.imshow(img, extent=(-1,1,-1,1), aspect=1)
# Specify the extent and aspect ratio of the bottom left subplot
plt.subplot(2,2,3)
plt.title('extent=(-1,1,-1,1),\naspect=2')
plt.xticks([-1,0,1])
plt.yticks([-1,0,1])
plt.imshow(img, extent=(-1,1,-1,1), aspect=2)
# Specify the extent and aspect ratio of the bottom right subplot
plt.subplot(2,2,4)
plt.title('extent=(-2,2,-1,1),\naspect=2')
plt.xticks([-2,-1,0,1,2])
plt.yticks([-1,0,1])
plt.imshow(img, extent=(-2,2,-1,1), aspect=2)
# Improve spacing and display the figure
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy126.png", bbox_inches="tight")
plt.clf()
# Downloaded Unequalized_Hawkes_Bay_NZ.jpg from https://commons.wikimedia.org/wiki/File:Unequalized_Hawkes_Bay_NZ.jpg
# Load the image into an array: image
# image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg')
image = plt.imread(myPath + 'Unequalized_Hawkes_Bay_NZ.jpg')
# Extract minimum and maximum values from the image: pmin, pmax
pmin, pmax = image.min(), image.max()
print("The smallest & largest pixel intensities are %d & %d." % (pmin, pmax))
# Rescale the pixels: rescaled_image
imageMean = image.mean(axis=2)
rescaled_image = 256*(imageMean - pmin) / (pmax - pmin)
print("The rescaled smallest & largest pixel intensities are %.1f & %.1f." %
(rescaled_image.min(), rescaled_image.max()))
# Make it a 3D Numpy array for grayscale
# rescaled_gray = np.zeros((imageMean.shape[0], imageMean.shape[1], 3))
# rescaled_gray[:, :, 0] = rescaled_image
# rescaled_gray[:, :, 1] = rescaled_image
# rescaled_gray[:, :, 2] = rescaled_image
# Display the original image in the top subplot
plt.subplot(2,1,1)
plt.title('original image')
plt.axis('off')
plt.imshow(image)
# Display the rescaled image in the bottom subplot
plt.subplot(2,1,2)
plt.title('rescaled image')
plt.axis('off')
plt.imshow(rescaled_image, cmap="gray")
# plt.show()
plt.savefig("_dummyPy127.png", bbox_inches="tight")
plt.clf()
## (3072, 3072, 3)
## (3072, 3072)
## The smallest & largest pixel intensities are 114 & 208.
## The rescaled smallest & largest pixel intensities are 0.0 & 256.0.
Example #1: Pseudo-Color Plot:
Example #2: Pseudo-Color Contour Plot:
Example #3: Varying the Color Map:
Example #4: Heat Map for mtcars:
Example #5: Heat Map using hexbin for mtcars:
Example #6: Astronaut Image:
Example #7: Astronaut Image (GrayScale):
Example #8: Astronaut Image (Aspect Ratio):
Example #9: Hawkes Bay Image (Raw and Rescaled):
Chapter 3 - Statistical Plots with Seaborn (statistical data visualization package)
General background - designed by Michael Waskom (Stanford):
Visualizing Regressions - using the “tips” data and looking at “tip” vs. “total_bill”:
Visualizing univariate distributions - strip plots, swarm plots, violin plots:
Visualizing bivariate / multivariate distributions - joint plots, pair plots, heat maps:
Example code includes:
myPath = "./PythonInputFiles/"
# NEED TO BRING OVER "auto" data
import pandas as pd
auto = pd.read_csv(myPath + "mtcars.csv", index_col=0)
auto = auto[["mpg", "wt", "hp", "cyl", "am", "disp"]]
# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns
# Plot a linear regression between 'weight' and 'hp'
sns.lmplot(x='wt', y='hp', data=auto)
# Display the plot
# plt.show()
plt.savefig("_dummyPy128.png", bbox_inches="tight")
plt.clf()
# Generate a green residual plot of the regression between 'hp' and 'mpg'
sns.residplot(x='hp', y='mpg', data=auto, color='green')
# Display the plot
# plt.show()
plt.savefig("_dummyPy129.png", bbox_inches="tight")
plt.clf()
# Generate a scatter plot of 'weight' and 'mpg' using red circles
plt.scatter(auto['wt'], auto["mpg"], label='data', color='red', marker='o')
# Plot in blue a linear regression of order 1 between 'weight' and 'mpg'
sns.regplot(x='wt', y='mpg', data=auto, color="blue", scatter=None, label='order 1')
# Plot in green a linear regression of order 2 between 'weight' and 'mpg'
sns.regplot(x='wt', y='mpg', data=auto, color="green", scatter=None, order=2, label='order 2')
# Add a legend and display the plot
plt.legend(loc="upper right")
# plt.show()
plt.savefig("_dummyPy130.png", bbox_inches="tight")
plt.clf()
# Plot a linear regression between 'weight' and 'hp', with a hue of 'cyl' and palette of 'Set1'
sns.lmplot(x="wt", y="hp", data=auto, hue="cyl", palette="Set1")
# Display the plot
# plt.show()
plt.savefig("_dummyPy131.png", bbox_inches="tight")
plt.clf()
# Plot linear regressions between 'weight' and 'hp' grouped row-wise by 'cyl'
sns.lmplot(x = "wt", y="hp", data=auto, row="cyl")
# Display the plot
# plt.show()
plt.savefig("_dummyPy132.png", bbox_inches="tight")
plt.clf()
# Make a strip plot of 'hp' grouped by 'cyl'
plt.subplot(2,1,1)
sns.stripplot(x="cyl", y="hp", data=auto)
# Make the strip plot again using jitter and a smaller point size
plt.subplot(2,1,2)
sns.stripplot(x="cyl", y="hp", data=auto, size=3, jitter=True)
# Display the plot
# plt.show()
plt.savefig("_dummyPy133.png", bbox_inches="tight")
plt.clf()
# Generate a swarm plot of 'hp' grouped horizontally by 'cyl'
plt.subplot(2,1,1)
sns.swarmplot(x="cyl", y="hp", data=auto)
# Generate a swarm plot of 'hp' grouped vertically by 'cyl' with a hue of 'am'
plt.subplot(2,1,2)
sns.swarmplot(y="cyl", x="hp", data=auto, hue="am", orient="h")
# Display the plot
# plt.show()
plt.savefig("_dummyPy134.png", bbox_inches="tight")
plt.clf()
# Generate a violin plot of 'hp' grouped horizontally by 'cyl'
plt.subplot(2,1,1)
sns.violinplot(x="cyl", y="hp", data=auto)
# Generate the same violin plot again with a color of 'lightgray' and without inner annotations
plt.subplot(2,1,2)
sns.violinplot(x="cyl", y="hp", data=auto, color="lightgray", inner=None)
# Overlay a strip plot on the violin plot
sns.stripplot(x="cyl", y="hp", data=auto, size=1.5, jitter=True)
# Display the plot
# plt.show()
plt.savefig("_dummyPy135.png", bbox_inches="tight")
plt.clf()
# Generate a joint plot of 'hp' and 'mpg'
sns.jointplot(x="hp", y="mpg", data=auto)
# Display the plot
# plt.show()
plt.savefig("_dummyPy136.png", bbox_inches="tight")
plt.clf()
# Generate a joint plot of 'hp' and 'mpg' using a hexbin plot
sns.jointplot(x="hp", y="mpg", data=auto, kind="hex")
# Display the plot
# plt.show()
plt.savefig("_dummyPy137.png", bbox_inches="tight")
plt.clf()
# Print the first 5 rows of the DataFrame
print(auto.head())
# Plot the pairwise joint distributions from the DataFrame
sns.pairplot(auto)
# Display the plot
# plt.show()
plt.savefig("_dummyPy138.png", bbox_inches="tight")
plt.clf()
# Plot the pairwise joint distributions grouped by 'am' along with regression lines
sns.pairplot(auto, hue="am", kind="reg")
# Display the plot
# plt.show()
plt.savefig("_dummyPy139.png", bbox_inches="tight")
plt.clf()
# NEED DATA - cov_matrix is 5x5 with mpg-hp-weight-accel-displ
# Print the covariance matrix
# print(cov_matrix)
# Visualize the covariance matrix using a heatmap
# sns.heatmap(cov_matrix)
# Display the heatmap
# plt.show()
## mpg wt hp cyl am disp
## Mazda RX4 21.0 2.620 110 6 1 160.0
## Mazda RX4 Wag 21.0 2.875 110 6 1 160.0
## Datsun 710 22.8 2.320 93 4 1 108.0
## Hornet 4 Drive 21.4 3.215 110 6 0 258.0
## Hornet Sportabout 18.7 3.440 175 8 0 360.0
Example #1: Seaborn Linear Regression Plot (sns.lmplot):
Example #2: Seaborn Residual Plot (sns.residplot):
Example #3: Labelled Plots (Raw Data, Order 1 Regression, Order 2 Regression):
Example #4: Regressions Stratified by Factor Using Colors:
Example #5: Regression Stratified by Factor (Separate Plots):
Example #6: Strip Plot:
Example #7: Swarm Plot:
Example #8: Violin Plot:
Example #9: Joint Plot (sns.jointplot):
Example #10: Joint Plot (Hexagonal):
Example #11: Pair Plot (sns.pairplot):
Example #12: Pair Plot Stratified Using Color (sns.pairplot):
Chapter 4 - Analyzing time series
Visualizing time series - example of the Austin 2010 weather data:
Time series with moving windows - taking a sample statistic (such as average or max/min) over a longer time period:
Histogram equilization in images - spreading out intensities so that subtle contrasts can be enhanced:
Example code includes:
myPath = "./PythonInputFiles/"
# Load the relevant stocks data
import pandas as pd
import numpy as np
from datetime import datetime
rawStocks = pd.read_csv(myPath + "StockChart_20170615.csv", header=None, index_col=None)
rawStocks["Date"] = [datetime.strptime(x.split()[0], "%m/%d/%Y") for x in rawStocks.iloc[:, 1]]
rawStocks["Price"] = [float(x.split()[1]) for x in rawStocks.iloc[:, 1]]
aapl = rawStocks.loc[rawStocks.iloc[:, 0] == "AAPL", ["Date", "Price"]].set_index("Date").sort_index()
goog = rawStocks.loc[rawStocks.iloc[:, 0] == "GOOG", ["Date", "Price"]].set_index("Date").sort_index()
ibm = rawStocks.loc[rawStocks.iloc[:, 0] == "IBM", ["Date", "Price"]].set_index("Date").sort_index()
# Import matplotlib.pyplot
import matplotlib.pyplot as plt
# Plot the aapl time series in blue
plt.plot(aapl, color="blue", label='AAPL')
# Plot the ibm time series in green
plt.plot(ibm, color='green', label='IBM')
# Plot the goog time series in red
plt.plot(goog, color='red', label='GOOG')
# Add a legend in the top left corner of the plot
plt.legend(loc='upper left')
# Specify the orientation of the xticks
plt.xticks(rotation=60)
# Display the plot
# plt.show()
plt.savefig("_dummyPy140.png", bbox_inches="tight")
plt.clf()
# Plot the series in the top subplot in blue
plt.subplot(2,1,1)
plt.xticks(rotation=45)
plt.title('AAPL: MAT June 2017')
plt.plot(aapl, color='blue')
# Slice aapl from '2017-01' to '2017-02' inclusive: view
view = aapl['2017-01':'2017-02']
# Plot the sliced data in the bottom subplot in black
plt.subplot(2,1,2)
plt.xticks(rotation=45)
plt.title('AAPL: 2017-01 to 2017-02')
plt.plot(view, color="black")
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy141.png", bbox_inches="tight")
plt.clf()
# Slice aapl from Nov. 2016 to Apr. 2017 inclusive: view
view = aapl['2016-11':'2017-04']
# Plot the sliced series in the top subplot in red
plt.subplot(2, 1, 1)
plt.plot(view, color="red")
plt.title('AAPL: Nov. 2016 to Apr. 2017')
plt.xticks(rotation=45)
# Reassign the series by slicing the month January 2017
view = aapl['2017-01']
# Plot the sliced series in the bottom subplot in green
plt.subplot(2, 1, 2)
plt.plot(view, color="green")
plt.title('AAPL: Jan. 2017')
plt.xticks(rotation=45)
# Improve spacing and display the plot
plt.tight_layout()
# plt.show()
plt.savefig("_dummyPy142.png", bbox_inches="tight")
plt.clf()
# Slice aapl from Nov. 2016 to Apr. 2017 inclusive: view
view = aapl['2016-11':'2017-04']
# Plot the entire series
plt.plot(aapl)
plt.xticks(rotation=45)
plt.title('AAPL: MAT June 2017')
# Specify the axes
plt.axes([0.25, 0.5, 0.35, 0.35])
# Plot the sliced series in red using the current axes
plt.plot(view, color="red")
plt.xticks(rotation=45)
plt.title('2016/11-2017/04')
# plt.show()
plt.savefig("_dummyPy143.png", bbox_inches="tight")
plt.clf()
# BASED OFF THE aapl DATASET
mean_10 = aapl.rolling(window=10).mean()
mean_30 = aapl.rolling(window=30).mean()
mean_75 = aapl.rolling(window=75).mean()
mean_125 = aapl.rolling(window=125).mean()
# Plot the 10-day moving average in the top left subplot in green
plt.subplot(2, 2, 1)
plt.plot(mean_10, color="green")
plt.plot(aapl, 'k-.')
plt.xticks(rotation=60)
plt.title('10d averages')
# Plot the 30-day moving average in the top right subplot in red
plt.subplot(2, 2, 2)
plt.plot(mean_30, 'red')
plt.plot(aapl, 'k-.')
plt.xticks(rotation=60)
plt.title('30d averages')
# Plot the 75-day moving average in the bottom left subplot in magenta
plt.subplot(2, 2, 3)
plt.plot(mean_75, color="magenta")
plt.plot(aapl, 'k-.')
plt.xticks(rotation=60)
plt.title('75d averages')
# Plot the 125-day moving average in the bottom right subplot in cyan
plt.subplot(2, 2, 4)
plt.plot(mean_125, color="cyan")
plt.plot(aapl, 'k-.')
plt.xticks(rotation=60)
plt.title('125d averages')
# Display the plot
# plt.show()
plt.savefig("_dummyPy144.png", bbox_inches="tight")
plt.clf()
std_10 = aapl.rolling(window=10).std()
std_30 = aapl.rolling(window=30).std()
std_75 = aapl.rolling(window=75).std()
std_125 = aapl.rolling(window=125).std()
# Plot std_10 in red
plt.plot(std_10, color="red", label='10d')
# Plot std_30 in cyan
plt.plot(std_30, color="cyan", label='30d')
# Plot std_75 in green
plt.plot(std_75, color="green", label='75d')
# Plot std_125 in magenta
plt.plot(std_125, color="magenta", label='125d')
# Add a legend to the upper left
plt.legend(loc="upper left")
# Add a title
plt.title('Moving standard deviations')
# Display the plot
# plt.show()
plt.savefig("_dummyPy145.png", bbox_inches="tight")
plt.clf()
# IMAGE AVAILABLE AT https://commons.wikimedia.org/wiki/File:Unequalized_Hawkes_Bay_NZ.jpg
# Load the image into an array, keeping just one of the RGB layers: image
image = plt.imread(myPath + 'Unequalized_Hawkes_Bay_NZ.jpg')[:, :, 0]
# Display image in top subplot using color map 'gray'
plt.subplot(2,1,1)
plt.title('Original image')
plt.axis('off')
plt.imshow(image, cmap="gray")
# Flatten the image into 1 dimension: pixels
pixels = image.flatten()
# Display a histogram of the pixels in the bottom subplot
plt.subplot(2,1,2)
plt.xlim((0,255))
plt.title('Normalized histogram')
plt.hist(pixels, bins=64, range=(0, 256), normed=True, color="red", alpha=0.4)
# Display the plot
# plt.show()
plt.savefig("_dummyPy146.png", bbox_inches="tight")
plt.clf()
# Load the image into an array: image
image = plt.imread(myPath + 'Unequalized_Hawkes_Bay_NZ.jpg')[:, :, 0]
# Display image in top subplot using color map 'gray'
plt.subplot(2,1,1)
plt.imshow(image, cmap='gray')
plt.title('Original image')
plt.axis('off')
# Flatten the image into 1 dimension: pixels
pixels = image.flatten()
# Display a histogram of the pixels in the bottom subplot
plt.subplot(2,1,2)
pdf = plt.hist(pixels, bins=64, range=(0,256), normed=False,
color='red', alpha=0.4)
plt.grid('off')
# Use plt.twinx() to overlay the CDF in the bottom subplot
plt.twinx()
# Display a cumulative histogram of the pixels
cdf = plt.hist(pixels, bins=64, range=(0,256),
normed=True, cumulative=True,
color='blue', alpha=0.4)
# Specify x-axis range, hide axes, add title and display plot
plt.xlim((0,256))
plt.grid('off')
plt.title('PDF & CDF (original image)')
# plt.show()
plt.savefig("_dummyPy147.png", bbox_inches="tight")
plt.clf()
# Load the image into an array: image
image = plt.imread(myPath + 'Unequalized_Hawkes_Bay_NZ.jpg')[:, :, 0]
# Flatten the image into 1 dimension: pixels
pixels = image.flatten()
# Generate a cumulative histogram
cdf, bins, patches = plt.hist(pixels, bins=256, range=(0,256), normed=True, cumulative=True)
new_pixels = np.interp(pixels, bins[:-1], cdf*255)
# Reshape new_pixels as a 2-D array: new_image
new_image = new_pixels.reshape(image.shape)
# Display the new image with 'gray' color map
plt.subplot(2,1,1)
plt.title('Equalized image')
plt.axis('off')
plt.imshow(new_image, cmap="gray")
# Generate a histogram of the new pixels
plt.subplot(2,1,2)
pdf = plt.hist(new_pixels, bins=64, range=(0,256), normed=False,
color='red', alpha=0.4)
plt.grid('off')
# Use plt.twinx() to overlay the CDF in the bottom subplot
plt.twinx()
plt.xlim((0,256))
plt.grid('off')
# Add title
plt.title('PDF & CDF (equalized image)')
# Generate a cumulative histogram of the new pixels
cdf = plt.hist(new_pixels, bins=64, range=(0,256),
cumulative=True, normed=True,
color='blue', alpha=0.4)
# plt.show()
plt.savefig("_dummyPy148.png", bbox_inches="tight")
plt.clf()
# NEXT IMAGE AVAILABLE AT http://imgsrc.hubblesite.org/hu/db/images/hs-2004-32-b-small_web.jpg
# Load the image into an array: image
# image = plt.imread('hs-2004-32-b-small_web.jpg')
# Display image in top subplot
# plt.subplot(2,1,1)
# plt.title('Original image')
# plt.axis('off')
# plt.imshow(image)
# Extract 2-D arrays of the RGB channels: red, blue, green
# red, green, blue = image[:,:,0], image[:,:,1], image[:,:,2]
# Flatten the 2-D arrays of the RGB channels into 1-D
# red_pixels = red.flatten()
# blue_pixels = blue.flatten()
# green_pixels = green.flatten()
# Overlay histograms of the pixels of each color in the bottom subplot
# plt.subplot(2,1,2)
# plt.title('Histograms from color image')
# plt.xlim((0,256))
# plt.hist(red_pixels, bins=64, normed=True, color='red', alpha = 0.2)
# plt.hist(blue_pixels, bins=64, normed=True, color='blue', alpha = 0.2)
# plt.hist(green_pixels, bins=64, normed=True, color='green', alpha = 0.2)
# Display the plot
# plt.show()
# Load the image into an array: image
# image = plt.imread('hs-2004-32-b-small_web.jpg')
# Extract RGB channels and flatten into 1-D array
# red, blue, green = image[:,:,0], image[:,:,1], image[:,:,2]
# red_pixels = red.flatten()
# blue_pixels = blue.flatten()
# green_pixels = green.flatten()
# Generate a 2-D histogram of the red and green pixels
# plt.subplot(2,2,1)
# plt.grid('off')
# plt.xticks(rotation=60)
# plt.xlabel('red')
# plt.ylabel('green')
# plt.hist2d(x=red_pixels, y=green_pixels, bins=(32, 32))
# Generate a 2-D histogram of the green and blue pixels
# plt.subplot(2,2,2)
# plt.grid('off')
# plt.xticks(rotation=60)
# plt.xlabel('green')
# plt.ylabel('blue')
# plt.hist2d(x=green_pixels, y=blue_pixels, bins=(32, 32))
# Generate a 2-D histogram of the blue and red pixels
# plt.subplot(2,2,3)
# plt.grid('off')
# plt.xticks(rotation=60)
# plt.xlabel('blue')
# plt.ylabel('red')
# plt.hist2d(x=blue_pixels, y=red_pixels, bins=(32, 32))
# Display the plot
# plt.show()
Example #1: Multiple Time Series on a Single Plot:
Example #2: Multiple Time Series on Separate Sub-Plots:
Example #3: Multiple Time Series on Separate Sub-Plots:
Example #4: Multiple Time Series as Callout on Single Main Plot:
Example #5: Rolling Mean Stock Prices (AAPL 10-d, 30-d, 75-d, 125-d):
Example #6: Rolling Standard Deviation of Stock Prices (AAPL 10-d, 30-d, 75-d, 125-d):
Example #7: Grayscale Image and Pixel Histogram:
Example #8: Grayscale Image and Pixel CDF/PDF:
Example #9: Original Grayscale Image and Normalized Grayscale Image:
Chapter 1 - Basic plotting with Bokeh
Plotting with glyphs - visual shapes that can be drawn to the screen (line, points, rectangles, etc.):
Additional glyphs - available by default in Bokeh:
Data formats - can pass lists, NumPy arrays, etc. as inputs to the glyphs:
Customizing glyphs - actions in response to hovering, user clicks, etc.:
Example code includes:
myPath = "./PythonInputFiles/"
# Import figure from bokeh.plotting
from bokeh.plotting import figure
# Import output_file and show from bokeh.io
from bokeh.io import output_file, show
import pandas as pd
rawGap = pd.read_csv(myPath + "literacy_birth_rate.csv", index_col=None)
fertility = rawGap["fertility"]
female_literacy = rawGap["female literacy"]
# Create the figure: p
p = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
# Add a circle glyph to the figure p
p.circle(fertility, female_literacy)
# Call the output_file() function and specify the name of the file
output_file(myPath + "fert_lit.html")
# Display the plot
# show(p)
# Create the figure: p
p = figure(x_axis_label='fertility', y_axis_label='female_literacy (% population)')
fertility_latinamerica = fertility[rawGap["Continent"] == "LAT"]
female_literacy_latinamerica = female_literacy[rawGap["Continent"] == "LAT"]
# Add a circle glyph to the figure p
p.circle(fertility_latinamerica, female_literacy_latinamerica)
fertility_africa = fertility[rawGap["Continent"] == "AF"]
female_literacy_africa = female_literacy[rawGap["Continent"] == "AF"]
# Add an x glyph to the figure p
p.x(fertility_africa, female_literacy_africa)
# Specify the name of the file
output_file(myPath + 'fert_lit_separate.html')
# Display the plot
# show(p)
# Create the figure: p
p = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
# Add a blue circle glyph to the figure p
p.circle(fertility_latinamerica, female_literacy_latinamerica, color="blue", size=10, alpha=0.8)
# Add a red circle glyph to the figure p
p.circle(fertility_africa, female_literacy_africa, color="red", size=10, alpha=0.8)
# Specify the name of the file
output_file(myPath + 'fert_lit_separate_colors.html')
# Display the plot
# show(p)
# AAPL share price - 2000 to 2014
from datetime import datetime
aaplRaw = pd.read_csv(myPath + "aapl_2000_2014.csv", index_col=0)
date = [datetime.strptime(x, "%Y-%m-%d") for x in aaplRaw["date"]]
price = aaplRaw["adj_close"]
# Import figure from bokeh.plotting
from bokeh.plotting import figure
# Create a figure with x_axis_type="datetime": p
p = figure(x_axis_type="datetime", x_axis_label='Date', y_axis_label='US Dollars')
# Plot date along the x axis and price along the y axis
p.line(date, price)
# Specify the name of the output file and show the result
output_file(myPath + 'line.html')
# show(p)
# Import figure from bokeh.plotting
from bokeh.plotting import figure
# Create a figure with x_axis_type='datetime': p
p = figure(x_axis_type='datetime', x_axis_label='Date', y_axis_label='US Dollars')
# Plot date along the x-axis and price along the y-axis
p.line(date, price)
# With date on the x-axis and price on the y-axis, add a white circle glyph of size 4
p.circle(date, price, fill_color="white", size=4)
# Specify the name of the output file and show the result
output_file(myPath + 'line.html')
# show(p)
# Create a list of az_lons, co_lons, nm_lons and ut_lons: x
# x = [az_lons, co_lons, nm_lons, ut_lons]
# Create a list of az_lats, co_lats, nm_lats and ut_lats: y
# y = [az_lats, co_lats, nm_lats, ut_lats]
# Add patches to figure p with line_color=white for x and y
# p.patches(x, y, line_color="white")
# Specify the name of the output file and show the result
# output_file('four_corners.html')
# show(p)
# Import numpy as np
import numpy as np
# Create array using np.linspace: x
x = np.linspace(0, 5, 100)
# Create array using np.cos: y
y = np.cos(x)
# Add circles at x and y
p.circle(x, y)
# Specify the name of the output file and show the result
output_file(myPath + 'numpy.html')
# show(p)
# Import pandas as pd
import pandas as pd
# Read in the CSV file: df
df = pd.read_csv(myPath + "auto-mpg.csv")
# Import figure from bokeh.plotting
from bokeh.plotting import figure
# Create the figure: p
p = figure(x_axis_label='HP', y_axis_label='MPG')
# Plot mpg vs hp by color
p.circle(df["hp"], df["mpg"], size=10, color=df["color"])
# Specify the name of the output file and show the result
output_file(myPath + 'auto-df.html')
# show(p)
# Import the ColumnDataSource class from bokeh.plotting
from bokeh.plotting import ColumnDataSource
# Create a ColumnDataSource from df: source
source = ColumnDataSource(df)
# Add circle glyphs to the figure p
p.circle("yr", "accel", source=source, color="color", size=8)
# Specify the name of the output file and show the result
output_file(myPath + 'sprint.html')
# show(p)
# Create a figure with the "box_select" tool: p
p = figure(x_axis_label="Year", y_axis_label="Accel", tools="box_select")
# Add circle glyphs to the figure p with the selected and non-selected properties
p.circle("yr", "accel", source=source, selection_color="red", nonselection_alpha=0.1)
# Specify the name of the output file and show the result
output_file(myPath + 'selection_glyph.html')
# show(p)
# import the HoverTool
# from bokeh.models import HoverTool
# Add circle glyphs to figure p
# p.circle(x, y, size=10,
# fill_color="grey", alpha=0.1, line_color=None,
# hover_fill_color="firebrick", hover_alpha=0.5,
# hover_line_color="white")
# Create a HoverTool: hover
# hover = HoverTool(tooltips=None, mode="vline")
# Add the hover tool to the figure p
# p.add_tools(hover)
# Specify the name of the output file and show the result
# output_file(myPath + 'hover_glyph.html')
# show(p)
#Import CategoricalColorMapper from bokeh.models
from bokeh.models import CategoricalColorMapper
# Convert df to a ColumnDataSource: source
source = ColumnDataSource(df)
# Make a CategoricalColorMapper object: color_mapper
color_mapper = CategoricalColorMapper(factors=['Europe', 'Asia', 'US'],
palette=['red', 'green', 'blue'])
# Add a circle glyph to the figure p
p.circle("weight", 'mpg', source=source,
color=dict(field='origin', transform=color_mapper),
legend='origin')
# Specify the name of the output file and show the result
output_file(myPath + 'colormap.html')
# show(p)
Chapter 2 - Layouts, Interactions, and Annotations
Introduction to Layouts - annotations, links across plots, etc.:
Advanced Layouts - continuing with gridded layouts and tabbed layouts:
Linking plots together - for example, keeping the ranges of two plots synchronized (even when panning or zooming occurs):
Annotations and guides - better communicate findings from the data:
Example code includes:
myPath = "./PythonInputFiles/"
# Import row from bokeh.layouts
from bokeh.layouts import row
from bokeh.plotting import ColumnDataSource, figure
from bokeh.io import output_file, show
import pandas as pd
rawGap = pd.read_csv(myPath + "literacy_birth_rate.csv", index_col=None)
rawGap.columns = ["Country", "Continent", "female_literacy", "fertility", "population"]
source = ColumnDataSource(rawGap)
# Create the first figure: p1
p1 = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
# Add a circle glyph to p1
p1.circle("fertility", "female_literacy", source=source)
# Create the second figure: p2
p2 = figure(x_axis_label='population', y_axis_label='female_literacy (% population)')
# Add a circle glyph to p2
p2.circle("population", "female_literacy", source=source)
# Put p1 and p2 into a horizontal row: layout
layout = row(p1, p2)
# Specify the name of the output_file and show the result
output_file(myPath + 'fert_row.html')
# show(layout)
# Import column from the bokeh.layouts module
from bokeh.layouts import column
# Create a blank figure: p1
p1 = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
# Add circle scatter to the figure p1
p1.circle('fertility', 'female_literacy', source=source)
# Create a new blank figure: p2
p2 = figure(x_axis_label="population", y_axis_label='female_literacy (% population)')
# Add circle scatter to the figure p2
p2.circle("population", "female_literacy", source=source)
# Put plots p1 and p2 in a column: layout
layout = column(p1, p2)
# Specify the name of the output_file and show the result
output_file(myPath + 'fert_column.html')
# show(layout)
# Import column and row from bokeh.layouts
from bokeh.layouts import row, column
# Make a column layout that will be used as the second row: row2
# row2 = column([mpg_hp, mpg_weight], sizing_mode='scale_width')
# Make a row layout that includes the above column layout: layout
# layout = row([avg_mpg, row2], sizing_mode='scale_width')
# Specify the name of the output_file and show the result
# output_file(myPath + 'layout_custom.html')
# show(layout)
# Import gridplot from bokeh.layouts
from bokeh.layouts import gridplot
# Create a list containing plots p1 and p2: row1
# row1 = [p1, p2]
# Create a list containing plots p3 and p4: row2
# row2 = [p3, p4]
# Create a gridplot using row1 and row2: layout
# layout = gridplot([row1, row2])
# Specify the name of the output_file and show the result
# output_file(myPath + 'grid.html')
# show(layout)
# Import Panel from bokeh.models.widgets
from bokeh.models.widgets import Panel
# Create a blank figure: p1
source = ColumnDataSource(rawGap.loc[rawGap["Continent"] == "LAT", :])
p1 = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
p1.circle('fertility', 'female_literacy', source=source)
source = ColumnDataSource(rawGap.loc[rawGap["Continent"] == "AF", :])
p2 = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
p2.circle('fertility', 'female_literacy', source=source)
source = ColumnDataSource(rawGap.loc[rawGap["Continent"] == "ASI", :])
p3 = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
p3.circle('fertility', 'female_literacy', source=source)
source = ColumnDataSource(rawGap.loc[rawGap["Continent"] == "EUR", :])
p4 = figure(x_axis_label='fertility (children per woman)', y_axis_label='female_literacy (% population)')
p4.circle('fertility', 'female_literacy', source=source)
source = ColumnDataSource(rawGap)
# Create tab1 from plot p1: tab1
tab1 = Panel(child=p1, title='Latin America')
# Create tab2 from plot p2: tab2
tab2 = Panel(child=p2, title='Africa')
# Create tab3 from plot p3: tab3
tab3 = Panel(child=p3, title='Asia')
# Create tab4 from plot p4: tab4
tab4 = Panel(child=p4, title='Europe')
# Import Tabs from bokeh.models.widgets
from bokeh.models.widgets import Tabs
# Create a Tabs layout: layout
layout = Tabs(tabs=[tab1, tab2, tab3, tab4])
# Specify the name of the output_file and show the result
output_file(myPath + 'tabs.html')
# show(layout)
# Link the x_range of p2 to p1: p2.x_range
p2.x_range = p1.x_range
# Link the y_range of p2 to p1: p2.y_range
p2.y_range = p1.y_range
# Link the x_range of p3 to p1: p3.x_range
p3.x_range = p1.x_range
# Link the y_range of p4 to p1: p4.y_range
p4.y_range = p1.y_range
# Specify the name of the output_file and show the result
output_file(myPath + 'linked_range.html')
# show(layout)
# Create ColumnDataSource: source
source = ColumnDataSource(rawGap)
# Create the first figure: p1
p1 = figure(x_axis_label='fertility (children per woman)', y_axis_label='female literacy (% population)',
tools="box_select,lasso_select")
# Add a circle glyph to p1
p1.circle("fertility", "female_literacy", source=source)
# Create the second figure: p2
p2 = figure(x_axis_label='fertility (children per woman)', y_axis_label='population (millions)',
tools="box_select,lasso_select")
# Add a circle glyph to p2
p2.circle("fertility", "population", source=source)
# Create row layout of figures p1 and p2: layout
layout = row(p1, p2)
# Specify the name of the output_file and show the result
output_file(myPath + 'linked_brush.html')
# show(layout)
# Add the first circle glyph to the figure p
# p.circle('fertility', 'female_literacy', source=latin_america, size=10, color="red", legend="Latin America")
# Add the second circle glyph to the figure p
# p.circle('fertility', 'female_literacy', source=africa, size=10, color="blue", legend="Africa")
# Specify the name of the output_file and show the result
# output_file(myPath + 'fert_lit_groups.html')
# show(p)
# Assign the legend to the bottom left: p.legend.location
# p.legend.location = "bottom_left"
# Fill the legend background with the color 'lightgray': p.legend.background_fill_color
# p.legend.background_fill_color = "lightgray"
# Specify the name of the output_file and show the result
# output_file(myPath + 'fert_lit_groups.html')
# show(p)
# Import HoverTool from bokeh.models
from bokeh.models import HoverTool
# Create a HoverTool object: hover
# hover = HoverTool(tooltips=[('Country','@Country')])
# Add the HoverTool object to figure p
# p.add_tools(hover)
# Specify the name of the output_file and show the result
# output_file(myPath + 'hover.html')
# show(p)
Chapter 3 - High-Level Charts
Pre-set interfaces to simlify chart design for common graphs like Histograms, Box-plots and Scatter-plots:
Histograms - start with “from bokeh.charts import Histogram”:
BoxPlots - start with “from bokeh.charts import BoxPlot”:
Scatter Plots - main advantage relative to using glyphs as per previous chapters is certain automation of grouping options:
Example code includes:
myPath = "./PythonInputFiles/"
from bokeh.layouts import row
from bokeh.plotting import ColumnDataSource, figure
from bokeh.io import output_file, show
import pandas as pd
rawGap = pd.read_csv(myPath + "literacy_birth_rate.csv", index_col=None)
rawGap.columns = ["Country", "Continent", "female_literacy", "fertility", "population"]
source = ColumnDataSource(rawGap)
# Import Histogram, output_file, and show from bokeh.charts
from bokeh.charts import Histogram, output_file, show
# Make a Histogram: p
p = Histogram(rawGap, "female_literacy", title="Female Literacy")
# Set the x axis label
p.xaxis.axis_label = ""
# Set the y axis label
p.yaxis.axis_label = ""
# Specify the name of the output_file and show the result
output_file(myPath + "histogram.html")
# show(p)
# Import Histogram, output_file, and show from bokeh.charts
from bokeh.charts import Histogram, output_file, show
# Make the Histogram: p
p = Histogram(rawGap, "female_literacy", title='Female Literacy', bins=40)
# Set axis labels
p.xaxis.axis_label = 'Female Literacy (% population)'
p.yaxis.axis_label = 'Number of Countries'
# Specify the name of the output_file and show the result
output_file(myPath + 'histogram.html')
# show(p)
# Import Histogram, output_file, and show from bokeh.charts
from bokeh.charts import Histogram, output_file, show
# Make a Histogram: p
p = Histogram(rawGap, "female_literacy", title='Female Literacy',
color="Continent", legend="top_left")
# Set axis labels
p.xaxis.axis_label = 'Female Literacy (% population)'
p.yaxis.axis_label = 'Number of Countries'
# Specify the name of the output_file and show the result
output_file(myPath + 'hist_bins.html')
# show(p)
# Import BoxPlot, output_file, and show from bokeh.charts
from bokeh.charts import BoxPlot, output_file, show
# Make a box plot: p
p = BoxPlot(rawGap, values="female_literacy", label="Continent",
title='Female Literacy (grouped by Continent)', legend='bottom_right')
# Set the y axis label
p.yaxis.axis_label = 'Female literacy (% population)'
# Specify the name of the output_file and show the result
output_file(myPath + 'boxplot.html')
# show(p)
# Import BoxPlot, output_file, and show
from bokeh.charts import BoxPlot, output_file, show
# Make a box plot: p
p = BoxPlot(rawGap, values="female_literacy", label='Continent', color="Continent",
title='Female Literacy (grouped by Continent)', legend="bottom_right")
# Set y-axis label
p.yaxis.axis_label = 'Female literacy (% population)'
# Specify the name of the output_file and show the result
output_file(myPath + 'boxplot.html')
# show(p)
# Import Scatter, output_file, and show from bokeh.charts
from bokeh.charts import Scatter, output_file, show
# Make a scatter plot: p
p = Scatter(rawGap, x="population", y="female_literacy",
title='Female Literacy vs Population')
# Set the x-axis label
p.xaxis.axis_label = "Population"
# Set the y-axis label
p.yaxis.axis_label = "Female Literacy"
# Specify the name of the output_file and show the result
output_file(myPath + 'scatterplot.html')
# show(p)
# Import Scatter, output_file, and show from bokeh.charts
from bokeh.charts import Scatter, output_file, show
# Make a scatter plot such that each circle is colored by its continent: p
p = Scatter(rawGap, x="population", y="female_literacy", color="Continent",
title='Female Literacy vs Population')
# Set x-axis and y-axis labels
p.xaxis.axis_label = 'Population (millions)'
p.yaxis.axis_label = 'Female literacy (% population)'
# Specify the name of the output_file and show the result
output_file(myPath + 'scatterplot.html')
# show(p)
# Import Scatter, output_file, and show from bokeh.charts
from bokeh.charts import Scatter, output_file, show
# Make a scatter plot such that each continent has a different marker type: p
p = Scatter(rawGap, x="population", y="female_literacy", color="Continent", marker="Continent", title="Female Literacy vs. Population")
# Set x-axis and y-axis labels
p.xaxis.axis_label = 'Population (millions)'
p.yaxis.axis_label = 'Female literacy (% population)'
# Specify the name of the output_file and show the result
output_file(myPath + 'scatterplot.html')
# show(p)